From 52cd9974be908bf693832012e56e945e9e34f389 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks
Date: Mon, 15 Apr 2024 20:40:43 +0000
Subject: [PATCH 001/300] [clang] Add flag to experiment with cold function attributes

To be removed, or promoted to a proper driver flag, if experiments turn out
fruitful.

Original LLVM patch for this functionality: #69030
---
 clang/lib/CodeGen/BackendUtil.cpp | 57 +++++++++++++++++++------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 6cc00b85664f41..22c3f8642ad8eb 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -104,6 +104,21 @@ static cl::opt<bool> ClSanitizeOnOptimizerEarlyEP(
     "sanitizer-early-opt-ep", cl::Optional,
     cl::desc("Insert sanitizers on OptimizerEarlyEP."));
 
+// Experiment to mark cold functions as optsize/minsize/optnone.
+// TODO: remove once this is exposed as a proper driver flag.
+static cl::opt<PGOOptions::ColdFuncOpt> ClPGOColdFuncAttr(
+    "pgo-cold-func-opt", cl::init(PGOOptions::ColdFuncOpt::Default), cl::Hidden,
+    cl::desc(
+        "Function attribute to apply to cold functions as determined by PGO"),
+    cl::values(clEnumValN(PGOOptions::ColdFuncOpt::Default, "default",
+                          "Default (no attribute)"),
+               clEnumValN(PGOOptions::ColdFuncOpt::OptSize, "optsize",
+                          "Mark cold functions with optsize."),
+               clEnumValN(PGOOptions::ColdFuncOpt::MinSize, "minsize",
+                          "Mark cold functions with minsize."),
+               clEnumValN(PGOOptions::ColdFuncOpt::OptNone, "optnone",
+                          "Mark cold functions with optnone.")));
+
 extern cl::opt<InstrProfCorrelator::ProfCorrelatorKind> ProfileCorrelate;
 
 // Re-link builtin bitcodes after optimization
@@ -768,42 +783,41 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
           CodeGenOpts.InstrProfileOutput.empty() ? getDefaultProfileGenName()
                                                  : CodeGenOpts.InstrProfileOutput,
           "", "", CodeGenOpts.MemoryProfileUsePath, nullptr, PGOOptions::IRInstr,
-          PGOOptions::NoCSAction, PGOOptions::ColdFuncOpt::Default,
+          PGOOptions::NoCSAction, ClPGOColdFuncAttr,
          CodeGenOpts.DebugInfoForProfiling,
          /*PseudoProbeForProfiling=*/false, CodeGenOpts.AtomicProfileUpdate);
    else if (CodeGenOpts.hasProfileIRUse()) {
      // -fprofile-use.
      auto CSAction = CodeGenOpts.hasProfileCSIRUse() ?
PGOOptions::CSIRUse : PGOOptions::NoCSAction; - PGOOpt = PGOOptions( - CodeGenOpts.ProfileInstrumentUsePath, "", - CodeGenOpts.ProfileRemappingFile, CodeGenOpts.MemoryProfileUsePath, VFS, - PGOOptions::IRUse, CSAction, PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling); + PGOOpt = PGOOptions(CodeGenOpts.ProfileInstrumentUsePath, "", + CodeGenOpts.ProfileRemappingFile, + CodeGenOpts.MemoryProfileUsePath, VFS, + PGOOptions::IRUse, CSAction, ClPGOColdFuncAttr, + CodeGenOpts.DebugInfoForProfiling); } else if (!CodeGenOpts.SampleProfileFile.empty()) // -fprofile-sample-use PGOOpt = PGOOptions( CodeGenOpts.SampleProfileFile, "", CodeGenOpts.ProfileRemappingFile, CodeGenOpts.MemoryProfileUsePath, VFS, PGOOptions::SampleUse, - PGOOptions::NoCSAction, PGOOptions::ColdFuncOpt::Default, + PGOOptions::NoCSAction, ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, CodeGenOpts.PseudoProbeForProfiling); else if (!CodeGenOpts.MemoryProfileUsePath.empty()) // -fmemory-profile-use (without any of the above options) PGOOpt = PGOOptions("", "", "", CodeGenOpts.MemoryProfileUsePath, VFS, PGOOptions::NoAction, PGOOptions::NoCSAction, - PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling); + ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling); else if (CodeGenOpts.PseudoProbeForProfiling) // -fpseudo-probe-for-profiling - PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, - PGOOptions::NoAction, PGOOptions::NoCSAction, - PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling, true); + PGOOpt = + PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, + PGOOptions::NoAction, PGOOptions::NoCSAction, + ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, true); else if (CodeGenOpts.DebugInfoForProfiling) // -fdebug-info-for-profiling PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, PGOOptions::NoAction, PGOOptions::NoCSAction, - PGOOptions::ColdFuncOpt::Default, true); + ClPGOColdFuncAttr, true); // Check to see if we want to generate a CS profile. if (CodeGenOpts.hasProfileCSIRInstr()) { @@ -820,14 +834,13 @@ void EmitAssemblyHelper::RunOptimizationPipeline( : CodeGenOpts.InstrProfileOutput; PGOOpt->CSAction = PGOOptions::CSIRInstr; } else - PGOOpt = - PGOOptions("", - CodeGenOpts.InstrProfileOutput.empty() - ? getDefaultProfileGenName() - : CodeGenOpts.InstrProfileOutput, - "", /*MemoryProfile=*/"", nullptr, PGOOptions::NoAction, - PGOOptions::CSIRInstr, PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling); + PGOOpt = PGOOptions("", + CodeGenOpts.InstrProfileOutput.empty() + ? getDefaultProfileGenName() + : CodeGenOpts.InstrProfileOutput, + "", /*MemoryProfile=*/"", nullptr, + PGOOptions::NoAction, PGOOptions::CSIRInstr, + ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling); } if (TM) TM->setPGOOption(PGOOpt); From 69d861e1320119e9a02907155ca626b1db90ad93 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Mon, 15 Apr 2024 16:55:58 -0400 Subject: [PATCH 002/300] [clang] Move tailclipping to bitfield allocation (#87090) Move bitfield access clipping to bitfield access computation. 
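
For illustration, a minimal example (reconstructed from the comments in the
patch below, not new semantics) of why clipping is needed:

```c
/* 'a' wants an i24 access unit, but i24 has a storage size of 32 bits.
   'b' is laid out at offset 3, inside that spare byte, so the access unit
   must be lowered as [3 x i8] rather than i24. */
struct S { int a : 24; char b; };
```

With this change, that [i8 x N]-versus-iN decision is made while
accumulating bitfields in accumulateBitfields(), instead of in the separate
clipTailPadding() pass this patch deletes.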
--- clang/lib/CodeGen/CGRecordLayoutBuilder.cpp | 84 +++++++++++---------- clang/test/CodeGen/bitfield-access-unit.c | 18 +++++ 2 files changed, 62 insertions(+), 40 deletions(-) diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp index 634a55fec5182e..868b1ab98e048a 100644 --- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp +++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp @@ -41,10 +41,11 @@ namespace { /// contains enough information to determine where the runs break. Microsoft /// and Itanium follow different rules and use different codepaths. /// * It is desired that, when possible, bitfields use the appropriate iN type -/// when lowered to llvm types. For example unsigned x : 24 gets lowered to +/// when lowered to llvm types. For example unsigned x : 24 gets lowered to /// i24. This isn't always possible because i24 has storage size of 32 bit -/// and if it is possible to use that extra byte of padding we must use -/// [i8 x 3] instead of i24. The function clipTailPadding does this. +/// and if it is possible to use that extra byte of padding we must use [i8 x +/// 3] instead of i24. This is computed when accumulating bitfields in +/// accumulateBitfields. /// C++ examples that require clipping: /// struct { int a : 24; char b; }; // a must be clipped, b goes at offset 3 /// struct A { int a : 24; ~A(); }; // a must be clipped because: @@ -62,11 +63,7 @@ namespace { /// that the tail padding is not used in the complete class.) However, /// because LLVM reads from the complete type it can generate incorrect code /// if we do not clip the tail padding off of the bitfield in the complete -/// layout. This introduces a somewhat awkward extra unnecessary clip stage. -/// The location of the clip is stored internally as a sentinel of type -/// SCISSOR. If LLVM were updated to read base types (which it probably -/// should because locations of things such as VBases are bogus in the llvm -/// type anyway) then we could eliminate the SCISSOR. +/// layout. /// * Itanium allows nearly empty primary virtual bases. These bases don't get /// get their own storage because they're laid out as part of another base /// or at the beginning of the structure. Determining if a VBase actually @@ -200,9 +197,7 @@ struct CGRecordLowering { const CXXRecordDecl *Query) const; void calculateZeroInit(); CharUnits calculateTailClippingOffset(bool isNonVirtualBaseType) const; - /// Lowers bitfield storage types to I8 arrays for bitfields with tail - /// padding that is or can potentially be used. - void clipTailPadding(); + void checkBitfieldClipping() const; /// Determines if we need a packed llvm struct. void determinePacked(bool NVBaseType); /// Inserts padding everywhere it's needed. @@ -305,7 +300,7 @@ void CGRecordLowering::lower(bool NVBaseType) { } llvm::stable_sort(Members); Members.push_back(StorageInfo(Size, getIntNType(8))); - clipTailPadding(); + checkBitfieldClipping(); determinePacked(NVBaseType); insertPadding(); Members.pop_back(); @@ -531,6 +526,7 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, // available padding characters. RecordDecl::field_iterator BestEnd = Begin; CharUnits BestEndOffset; + bool BestClipped; // Whether the representation must be in a byte array. for (;;) { // AtAlignedBoundary is true iff Field is the (potential) start of a new @@ -593,10 +589,9 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, // this is the best seen so far. 
        BestEnd = Field;
         BestEndOffset = BeginOffset + AccessSize;
-        if (Types.getCodeGenOpts().FineGrainedBitfieldAccesses)
-          // Fine-grained access, so no merging of spans.
-          InstallBest = true;
-        else if (!BitSizeSinceBegin)
+        // Assume clipped until proven not below.
+        BestClipped = true;
+        if (!BitSizeSinceBegin)
           // A zero-sized initial span -- this will install nothing and reset
           // for another.
           InstallBest = true;
@@ -624,6 +619,12 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType,
           // The access unit is not at a naturally aligned offset within the
           // structure.
           InstallBest = true;
+
+        if (InstallBest && BestEnd == Field)
+          // We're installing the first span, whose clipping was presumed
+          // above. Compute it correctly.
+          if (getSize(Type) == AccessSize)
+            BestClipped = false;
       }
 
       if (!InstallBest) {
@@ -656,11 +657,15 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType,
           // access unit.
           BestEndOffset = BeginOffset + TypeSize;
           BestEnd = Field;
+          BestClipped = false;
         }
 
         if (Barrier)
           // The next field is a barrier that we cannot merge across.
           InstallBest = true;
+        else if (Types.getCodeGenOpts().FineGrainedBitfieldAccesses)
+          // Fine-grained access, so no merging of spans.
+          InstallBest = true;
         else
           // Otherwise, we're not installing. Update the bit size
           // of the current span to go all the way to LimitOffset, which is
@@ -679,7 +684,17 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType,
     // Add the storage member for the access unit to the record. The
     // bitfields get the offset of their storage but come afterward and
     // remain there after a stable sort.
-    llvm::Type *Type = getIntNType(Context.toBits(AccessSize));
+    llvm::Type *Type;
+    if (BestClipped) {
+      assert(getSize(getIntNType(Context.toBits(AccessSize))) >
+                 AccessSize &&
+             "Clipped access need not be clipped");
+      Type = getByteArrayType(AccessSize);
+    } else {
+      Type = getIntNType(Context.toBits(AccessSize));
+      assert(getSize(Type) == AccessSize &&
+             "Unclipped access must be clipped");
+    }
     Members.push_back(StorageInfo(BeginOffset, Type));
     for (; Begin != BestEnd; ++Begin)
       if (!Begin->isZeroLengthBitField(Context))
@@ -934,32 +949,21 @@ void CGRecordLowering::calculateZeroInit() {
   }
 }
 
-void CGRecordLowering::clipTailPadding() {
-  std::vector<MemberInfo>::iterator Prior = Members.begin();
-  CharUnits Tail = getSize(Prior->Data);
-  for (std::vector<MemberInfo>::iterator Member = Prior + 1,
-                                         MemberEnd = Members.end();
-       Member != MemberEnd; ++Member) {
+// Verify accumulateBitfields computed the correct storage representations.
+void CGRecordLowering::checkBitfieldClipping() const {
+#ifndef NDEBUG
+  auto Tail = CharUnits::Zero();
+  for (const auto &M : Members) {
     // Only members with data and the scissor can cut into tail padding.
- if (!Member->Data && Member->Kind != MemberInfo::Scissor) + if (!M.Data && M.Kind != MemberInfo::Scissor) continue; - if (Member->Offset < Tail) { - assert(Prior->Kind == MemberInfo::Field && - "Only storage fields have tail padding!"); - if (!Prior->FD || Prior->FD->isBitField()) - Prior->Data = getByteArrayType(bitsToCharUnits(llvm::alignTo( - cast(Prior->Data)->getIntegerBitWidth(), 8))); - else { - assert(Prior->FD->hasAttr() && - "should not have reused this field's tail padding"); - Prior->Data = getByteArrayType( - Context.getTypeInfoDataSizeInChars(Prior->FD->getType()).Width); - } - } - if (Member->Data) - Prior = Member; - Tail = Prior->Offset + getSize(Prior->Data); + + assert(M.Offset >= Tail && "Bitfield access unit is not clipped"); + Tail = M.Offset; + if (M.Data) + Tail += getSize(M.Data); } +#endif } void CGRecordLowering::determinePacked(bool NVBaseType) { diff --git a/clang/test/CodeGen/bitfield-access-unit.c b/clang/test/CodeGen/bitfield-access-unit.c index 1aed2e7202fc65..d0553c5183eeff 100644 --- a/clang/test/CodeGen/bitfield-access-unit.c +++ b/clang/test/CodeGen/bitfield-access-unit.c @@ -222,6 +222,24 @@ struct G { // LAYOUT-DWN32-NEXT: +struct __attribute__((aligned(8))) H { + char a; + unsigned b : 24; // on expensive alignment we want this to stay 24 + unsigned c __attribute__((aligned(8))); // Think 'long long' or lp64 ptr +} h; +// CHECK-LABEL: LLVMType:%struct.H = +// LAYOUT-FLEX-SAME: type <{ i8, i32, [3 x i8], i32, [4 x i8] }> +// LAYOUT-STRICT-SAME: type { i8, [3 x i8], [4 x i8], i32, [4 x i8] } +// LAYOUT-DWN32-FLEX-SAME: type <{ i8, i32, [3 x i8], i32, [4 x i8] }> +// LAYOUT-DWN32-STRICT-SAME: type { i8, [3 x i8], [4 x i8], i32, [4 x i8] } +// CHECK: BitFields:[ +// LAYOUT-FLEX-NEXT: + #if _LP64 struct A64 { int a : 16; From c303945409a740c8fdb4103a4f21df55187aa84f Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 15 Apr 2024 17:01:28 -0400 Subject: [PATCH 003/300] [gn] port e356f68 more --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 4383f1d6d18ff2..865a79b63cd848 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -988,6 +988,7 @@ if (current_toolchain == default_toolchain) { "errno.h", "exception", "execution", + "expected", "experimental/__config", "experimental/__simd/aligned_tag.h", "experimental/__simd/declaration.h", From a855eea7fe86ef09a87f6251b3b711b821ae32bf Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 15 Apr 2024 14:05:01 -0700 Subject: [PATCH 004/300] [lldb] Fix the standalone Xcode build after #88317 In #88317, the clang resource headers was converted to an interface library. Update LLDB and fix the Xcode standalone build. Thanks Evan for the help! 
---
 lldb/cmake/modules/LLDBFramework.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/cmake/modules/LLDBFramework.cmake b/lldb/cmake/modules/LLDBFramework.cmake
index 81fc596ef4244e..f915839f6b45a5 100644
--- a/lldb/cmake/modules/LLDBFramework.cmake
+++ b/lldb/cmake/modules/LLDBFramework.cmake
@@ -119,7 +119,7 @@ add_custom_command(TARGET liblldb POST_BUILD
 if(NOT APPLE_EMBEDDED)
   if (TARGET clang-resource-headers)
     add_dependencies(liblldb clang-resource-headers)
-    set(clang_resource_headers_dir $)
+    set(clang_resource_headers_dir $<TARGET_PROPERTY:clang-resource-headers,INTERFACE_INCLUDE_DIRECTORIES>)
   else()
     set(clang_resource_headers_dir ${LLDB_EXTERNAL_CLANG_RESOURCE_DIR}/include)
     if(NOT EXISTS ${clang_resource_headers_dir})

From 21d177096f84c38cf434c21bd3ff0dbd2ca163d0 Mon Sep 17 00:00:00 2001
From: Kai Nacke
Date: Mon, 15 Apr 2024 17:12:25 -0400
Subject: [PATCH 005/300] [NFC] Refactor looping over recomputeLiveIns into function (#88040)

https://github.com/llvm/llvm-project/pull/79940 put calls to
recomputeLiveIns into a loop, to repeatedly call the function until the
computation converges. However, this repeats a lot of code. This change
moves the loop into a function to simplify the handling.

Note that this changes the order in which recomputeLiveIns is called. For
example,

```
bool anyChange = false;
do {
  anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB);
} while (anyChange);
```

only begins to recompute the live-ins for LoopMBB after the computation
for ExitMBB has converged. With this change, all basic blocks have a
recomputation of the live-ins for each loop iteration. This can result in
fewer or more calls, depending on the situation.
---
 llvm/include/llvm/CodeGen/LivePhysRegs.h      | 18 ++++++++++++++++++
 llvm/lib/CodeGen/BranchFolding.cpp            |  8 ++------
 .../Target/AArch64/AArch64FrameLowering.cpp   |  5 +----
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 11 ++---------
 llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp   |  8 +-------
 .../PowerPC/PPCExpandAtomicPseudoInsts.cpp    | 11 ++---------
 llvm/lib/Target/PowerPC/PPCFrameLowering.cpp  | 11 ++---------
 .../Target/SystemZ/SystemZFrameLowering.cpp   | 10 ++--------
 llvm/lib/Target/X86/X86FrameLowering.cpp      | 11 ++---------
 9 files changed, 32 insertions(+), 61 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/LivePhysRegs.h b/llvm/include/llvm/CodeGen/LivePhysRegs.h
index 1d40b1cbb0eaa3..9574a6f0c7c005 100644
--- a/llvm/include/llvm/CodeGen/LivePhysRegs.h
+++ b/llvm/include/llvm/CodeGen/LivePhysRegs.h
@@ -39,6 +39,8 @@
 
 namespace llvm {
 
+template <typename T> class ArrayRef;
+
 class MachineInstr;
 class MachineFunction;
 class MachineOperand;
@@ -207,6 +209,22 @@ static inline bool recomputeLiveIns(MachineBasicBlock &MBB) {
   return oldLiveIns != newLiveIns;
 }
 
+/// Convenience function for recomputing live-in's for a set of MBBs until the
+/// computation converges.
+inline void fullyRecomputeLiveIns(ArrayRef<MachineBasicBlock *> MBBs) {
+  MachineBasicBlock *const *Data = MBBs.data();
+  const size_t Len = MBBs.size();
+  while (true) {
+    bool AnyChange = false;
+    for (size_t I = 0; I < Len; ++I)
+      if (recomputeLiveIns(*Data[I]))
+        AnyChange = true;
+    if (!AnyChange)
+      return;
+  }
+}
+
+
 } // end namespace llvm
 
 #endif // LLVM_CODEGEN_LIVEPHYSREGS_H
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index ecf7bc30913f51..55aa1d438b2a66 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -2047,12 +2047,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
   MBB->splice(Loc, TBB, TBB->begin(), TIB);
   FBB->erase(FBB->begin(), FIB);
 
-  if (UpdateLiveIns) {
-    bool anyChange = false;
-    do {
-      anyChange = recomputeLiveIns(*TBB) || recomputeLiveIns(*FBB);
-    } while (anyChange);
-  }
+  if (UpdateLiveIns)
+    fullyRecomputeLiveIns({TBB, FBB});
 
   ++NumHoist;
   return true;
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 5cc612e89162af..419c141121c325 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -4325,10 +4325,7 @@ AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
   ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
   MBB.addSuccessor(LoopMBB);
   // Update liveins.
-  bool anyChange = false;
-  do {
-    anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB);
-  } while (anyChange);
+  fullyRecomputeLiveIns({ExitMBB, LoopMBB});
 
   return ExitMBB->begin();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 92647cb405252f..9518d573bccdd1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -9556,15 +9556,8 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
   MBB.addSuccessor(LoopTestMBB);
 
   // Update liveins.
- if (MF.getRegInfo().reservedRegsFrozen()) { - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*ExitMBB) || - recomputeLiveIns(*LoopBodyMBB) || - recomputeLiveIns(*LoopTestMBB); - } while (anyChange); - ; - } + if (MF.getRegInfo().reservedRegsFrozen()) + fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB}); return ExitMBB->begin(); } diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 8629551152cb64..ea5dd5427ce720 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1806,13 +1806,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { PostOrderLoopTraversal DFS(LoLoop.ML, *MLI); DFS.ProcessLoop(); const SmallVectorImpl &PostOrder = DFS.getOrder(); - bool anyChange = false; - do { - anyChange = false; - for (auto *MBB : PostOrder) { - anyChange = recomputeLiveIns(*MBB) || anyChange; - } - } while (anyChange); + fullyRecomputeLiveIns(PostOrder); for (auto *MBB : reverse(PostOrder)) recomputeLivenessFlags(*MBB); diff --git a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp index b43eee8fdd8c0f..b3cfcb2aa14405 100644 --- a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp @@ -208,10 +208,7 @@ bool PPCExpandAtomicPseudo::expandAtomicRMW128( .addMBB(LoopMBB); CurrentMBB->addSuccessor(LoopMBB); CurrentMBB->addSuccessor(ExitMBB); - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); - } while (anyChange); + fullyRecomputeLiveIns({ExitMBB, LoopMBB}); NMBBI = MBB.end(); MI.eraseFromParent(); return true; @@ -288,11 +285,7 @@ bool PPCExpandAtomicPseudo::expandAtomicCmpSwap128( CurrentMBB->addSuccessor(LoopCmpMBB); CurrentMBB->addSuccessor(ExitMBB); - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*CmpSuccMBB) || - recomputeLiveIns(*LoopCmpMBB); - } while (anyChange); + fullyRecomputeLiveIns({ExitMBB, CmpSuccMBB, LoopCmpMBB}); NMBBI = MBB.end(); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 6dcb59a3a57f85..04e9f9e2366edd 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1435,11 +1435,7 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, ProbeLoopBodyMBB->addSuccessor(ProbeLoopBodyMBB); } // Update liveins. - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*ProbeExitMBB) || - recomputeLiveIns(*ProbeLoopBodyMBB); - } while (anyChange); + fullyRecomputeLiveIns({ProbeExitMBB, ProbeLoopBodyMBB}); return ProbeExitMBB; }; // For case HasBP && MaxAlign > 1, we have to realign the SP by performing @@ -1531,10 +1527,7 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, buildDefCFAReg(*ExitMBB, ExitMBB->begin(), SPReg); } // Update liveins. 
- bool anyChange = false; - do { - anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); - } while (anyChange); + fullyRecomputeLiveIns({ExitMBB, LoopMBB}); } } ++NumPrologProbed; diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 4897b37d8eb1ef..50ecd6e0744147 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -824,10 +824,7 @@ void SystemZELFFrameLowering::inlineStackProbe( StackAllocMI->eraseFromParent(); if (DoneMBB != nullptr) { // Compute the live-in lists for the new blocks. - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*DoneMBB) || recomputeLiveIns(*LoopMBB); - } while (anyChange); + fullyRecomputeLiveIns({DoneMBB, LoopMBB}); } } @@ -1425,10 +1422,7 @@ void SystemZXPLINKFrameLowering::inlineStackProbe( StackAllocMI->eraseFromParent(); // Compute the live-in lists for the new blocks. - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*StackExtMBB) || recomputeLiveIns(*NextMBB); - } while (anyChange); + fullyRecomputeLiveIns({StackExtMBB, NextMBB}); } bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index d914e1b61ab075..4521401d8741c7 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -885,10 +885,7 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( } // Update Live In information - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*tailMBB) || recomputeLiveIns(*testMBB); - } while (anyChange); + fullyRecomputeLiveIns({tailMBB, testMBB}); } void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( @@ -1380,11 +1377,7 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, footMBB->addSuccessor(&MBB); } - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*footMBB) || recomputeLiveIns(*bodyMBB) || - recomputeLiveIns(*headMBB) || recomputeLiveIns(MBB); - } while (anyChange); + fullyRecomputeLiveIns({footMBB, bodyMBB, headMBB, &MBB}); } } else { MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) From 67571ffd2c51a72a23d57fb5ef746a6fadd6b09c Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 15 Apr 2024 15:00:30 -0700 Subject: [PATCH 006/300] [test][sanitizer] Compile .c file as C --- .../test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c b/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c index b14ac7bcf1924f..a6aed77d5691b8 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c @@ -1,4 +1,4 @@ -// RUN: %clangxx %s -pie -fPIE -o %t && %run %t +// RUN: %clang %s -pie -fPIE -o %t && %run %t // REQUIRES: x86_64-target-arch #include From a1ed652fb3589670180c08c2c6d5ef1ff337c658 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Mon, 15 Apr 2024 15:01:09 -0700 Subject: [PATCH 007/300] [test][sanitizer] Temporarily disable test Test, as expected, fails with Asan on system with 5lvl page tables. Disabling the test to migrate buildbot. 
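
For context (a summary of lit semantics, not part of the change): a test
whose UNSUPPORTED: list matches the current configuration is reported as
unsupported and skipped, so pairing the two directives

```c
// REQUIRES: x86_64-target-arch
// UNSUPPORTED: x86_64-target-arch
```

disables the test on the only configuration it was allowed to run on,
without deleting it from the tree.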
--- .../test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c b/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c index a6aed77d5691b8..02220cb78e6c99 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c @@ -1,6 +1,9 @@ // RUN: %clang %s -pie -fPIE -o %t && %run %t // REQUIRES: x86_64-target-arch +// FIXME: Fails Asan, as expected, with 5lvl page tables. +// UNSUPPORTED: x86_64-target-arch + #include #include #include From 63310243537ba8830f3533a5d93e7b04b10d6c9e Mon Sep 17 00:00:00 2001 From: Argyrios Kyrtzidis Date: Mon, 15 Apr 2024 15:05:55 -0700 Subject: [PATCH 008/300] [clang/DependencyScanning/ModuleDepCollector] Refactor part of `makeCommonInvocationForModuleBuild` into its own function (#88447) The new function is about clearing out benign codegen options and can be applied for PCH invocations as well. --- .../DependencyScanning/ModuleDepCollector.h | 5 +++ .../DependencyScanning/ModuleDepCollector.cpp | 36 ++++++++++++------- clang/test/ClangScanDeps/removed-args.c | 28 +++++++++++++++ 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h index 081899cc2c8503..da51292296a90f 100644 --- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h +++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h @@ -308,6 +308,11 @@ class ModuleDepCollector final : public DependencyCollector { ModuleDeps &Deps); }; +/// Resets codegen options that don't affect modules/PCH. +void resetBenignCodeGenOptions(frontend::ActionKind ProgramAction, + const LangOptions &LangOpts, + CodeGenOptions &CGOpts); + } // end namespace dependencies } // end namespace tooling } // end namespace clang diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 94ccbd3351b09d..e19f19b2528c15 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -154,6 +154,26 @@ void ModuleDepCollector::addOutputPaths(CowCompilerInvocation &CI, } } +void dependencies::resetBenignCodeGenOptions(frontend::ActionKind ProgramAction, + const LangOptions &LangOpts, + CodeGenOptions &CGOpts) { + // TODO: Figure out better way to set options to their default value. + if (ProgramAction == frontend::GenerateModule) { + CGOpts.MainFileName.clear(); + CGOpts.DwarfDebugFlags.clear(); + } + if (ProgramAction == frontend::GeneratePCH || + (ProgramAction == frontend::GenerateModule && !LangOpts.ModulesCodegen)) { + CGOpts.DebugCompilationDir.clear(); + CGOpts.CoverageCompilationDir.clear(); + CGOpts.CoverageDataFile.clear(); + CGOpts.CoverageNotesFile.clear(); + CGOpts.ProfileInstrumentUsePath.clear(); + CGOpts.SampleProfileFile.clear(); + CGOpts.ProfileRemappingFile.clear(); + } +} + static CowCompilerInvocation makeCommonInvocationForModuleBuild(CompilerInvocation CI) { CI.resetNonModularOptions(); @@ -167,18 +187,8 @@ makeCommonInvocationForModuleBuild(CompilerInvocation CI) { // LLVM options are not going to affect the AST CI.getFrontendOpts().LLVMArgs.clear(); - // TODO: Figure out better way to set options to their default value. 
- CI.getCodeGenOpts().MainFileName.clear(); - CI.getCodeGenOpts().DwarfDebugFlags.clear(); - if (!CI.getLangOpts().ModulesCodegen) { - CI.getCodeGenOpts().DebugCompilationDir.clear(); - CI.getCodeGenOpts().CoverageCompilationDir.clear(); - CI.getCodeGenOpts().CoverageDataFile.clear(); - CI.getCodeGenOpts().CoverageNotesFile.clear(); - CI.getCodeGenOpts().ProfileInstrumentUsePath.clear(); - CI.getCodeGenOpts().SampleProfileFile.clear(); - CI.getCodeGenOpts().ProfileRemappingFile.clear(); - } + resetBenignCodeGenOptions(frontend::GenerateModule, CI.getLangOpts(), + CI.getCodeGenOpts()); // Map output paths that affect behaviour to "-" so their existence is in the // context hash. The final path will be computed in addOutputPaths. @@ -342,6 +352,8 @@ static bool needsModules(FrontendInputFile FIF) { void ModuleDepCollector::applyDiscoveredDependencies(CompilerInvocation &CI) { CI.clearImplicitModuleBuildOptions(); + resetBenignCodeGenOptions(CI.getFrontendOpts().ProgramAction, + CI.getLangOpts(), CI.getCodeGenOpts()); if (llvm::any_of(CI.getFrontendOpts().Inputs, needsModules)) { Preprocessor &PP = ScanInstance.getPreprocessor(); diff --git a/clang/test/ClangScanDeps/removed-args.c b/clang/test/ClangScanDeps/removed-args.c index f49e4ead82f7bf..3e108f0549450c 100644 --- a/clang/test/ClangScanDeps/removed-args.c +++ b/clang/test/ClangScanDeps/removed-args.c @@ -93,3 +93,31 @@ // CHECK-NOT: "-fmodules-prune-interval= // CHECK-NOT: "-fmodules-prune-after= // CHECK: ], + +// Check for removed args for PCH invocations. + +// RUN: split-file %s %t +// RUN: sed "s|DIR|%/t|g" %t/cdb-pch.json.template > %t/cdb-pch.json +// RUN: clang-scan-deps -compilation-database %t/cdb-pch.json -format experimental-full > %t/result-pch.json +// RUN: cat %t/result-pch.json | sed 's:\\\\\?:/:g' | FileCheck %s -DPREFIX=%/t -check-prefix=PCH +// +// PCH-NOT: "-fdebug-compilation-dir=" +// PCH-NOT: "-fcoverage-compilation-dir=" +// PCH-NOT: "-coverage-notes-file +// PCH-NOT: "-coverage-data-file +// PCH-NOT: "-fprofile-instrument-use-path +// PCH-NOT: "-include" +// PCH-NOT: "-fmodules-cache-path= +// PCH-NOT: "-fmodules-validate-once-per-build-session" +// PCH-NOT: "-fbuild-session-timestamp= +// PCH-NOT: "-fmodules-prune-interval= +// PCH-NOT: "-fmodules-prune-after= + +//--- cdb-pch.json.template +[ + { + "directory": "DIR", + "command": "clang -x c-header DIR/header.h -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache -fdebug-compilation-dir=DIR/debug -fcoverage-compilation-dir=DIR/coverage -ftest-coverage -fprofile-instr-use=DIR/tu.profdata -o DIR/header.h.pch -serialize-diagnostics DIR/header.h.pch.diag ", + "file": "DIR/header.h.pch" + } +] From 6d234638f90bd422078c93745bdee73d6de201bf Mon Sep 17 00:00:00 2001 From: Fr4nk1in Date: Tue, 16 Apr 2024 06:13:38 +0800 Subject: [PATCH 009/300] [docs][mlir] Fix broken links in 'llvm' dialects. (#88704) Links to `llvm.mlir.global` and `llvm.mlir.addressof` in the ["Globals" section of LLVM dialect documentation](https://mlir.llvm.org/docs/Dialects/LLVM/#globals) are broken. --- mlir/docs/Dialects/LLVM.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/docs/Dialects/LLVM.md b/mlir/docs/Dialects/LLVM.md index a49ba35db9a68f..ba466aa6bc4012 100644 --- a/mlir/docs/Dialects/LLVM.md +++ b/mlir/docs/Dialects/LLVM.md @@ -139,12 +139,12 @@ will be reevaluated after considering composite constants. 
### Globals Global variables are also defined using a special operation, -[`llvm.mlir.global`](#llvmmlirglobal-mlirllvmglobalop), located at the module +[`llvm.mlir.global`](#llvmmlirglobal-llvmglobalop), located at the module level. Globals are MLIR symbols and are identified by their name. Since functions need to be isolated-from-above, i.e. values defined outside the function cannot be directly used inside the function, an additional operation, -[`llvm.mlir.addressof`](#llvmmliraddressof-mlirllvmaddressofop), is provided to +[`llvm.mlir.addressof`](#llvmmliraddressof-llvmaddressofop), is provided to locally define a value containing the _address_ of a global. The actual value can then be loaded from that pointer, or a new value can be stored into it if the global is not declared constant. This is similar to LLVM IR where globals From 206acf72c3b6c23e77716ccfc55ff94a4e7a7e3e Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 15 Apr 2024 18:48:49 -0400 Subject: [PATCH 010/300] [gn] port 8a7846fe86f95 (C++23 for libcxx, libcxxabi) --- llvm/utils/gn/secondary/libcxx/src/BUILD.gn | 2 +- llvm/utils/gn/secondary/libcxxabi/src/BUILD.gn | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn index 5da8db4574a0cc..1f6879358f22bc 100644 --- a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn @@ -49,7 +49,7 @@ config("cxx_config") { "-Wno-covered-switch-default", ] cflags_cc = [ - "-std=c++20", + "-std=c++23", "-nostdinc++", ] defines = [ diff --git a/llvm/utils/gn/secondary/libcxxabi/src/BUILD.gn b/llvm/utils/gn/secondary/libcxxabi/src/BUILD.gn index c82634e2bb0648..7a923c5c854d7e 100644 --- a/llvm/utils/gn/secondary/libcxxabi/src/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxxabi/src/BUILD.gn @@ -66,7 +66,7 @@ config("cxxabi_config") { "//libcxx/src", ] cflags_cc = [ - "-std=c++20", + "-std=c++23", "-nostdinc++", ] defines = [ From 466017c8dab74f66ce513c8752f0c1dcd16a8a63 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Mon, 15 Apr 2024 15:50:07 -0700 Subject: [PATCH 011/300] Work around test failure due to new aslr default --- .../TestDiagnoseDereferenceFunctionReturn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py b/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py index d8f45161378b0f..4d9b036f5102cb 100644 --- a/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py +++ b/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py @@ -19,6 +19,9 @@ def test_diagnose_dereference_function_return(self): TestBase.setUp(self) self.build() exe = self.getBuildArtifact("a.out") + # FIXME: This default changed in lldbtest.py and this test + # seems to rely on having it turned off. 
+ self.runCmd("settings set target.disable-aslr true") self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) self.runCmd("run", RUN_SUCCEEDED) self.expect("thread list", "Thread should be stopped", substrs=["stopped"]) From 9cb755cf5b2f48117fa34f257f386acf59ad6397 Mon Sep 17 00:00:00 2001 From: Wu Yingcong Date: Mon, 15 Apr 2024 16:37:35 -0700 Subject: [PATCH 012/300] [Test][JITLink] Save rbx in ExecutionEngine/JITLink/x86-64/ELF_vtune.s (#86472) The callee should preserve rbx according to the calling convention, but it is not in the test case `ExecutionEngine/JITLink/x86-64/ELF_vtune.s`. Not preserving the rbx register may result in some random error to the caller function. This patch adds the missing command to preserve the rbx. --- llvm/test/ExecutionEngine/JITLink/x86-64/ELF_vtune.s | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_vtune.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_vtune.s index 1c95bde51e1211..936486b8a319cc 100644 --- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_vtune.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_vtune.s @@ -20,12 +20,14 @@ main: .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp + pushq %rbx .cfi_def_cfa_register 6 - movl %edi, -4(%rbp) - movq %rsi, -16(%rbp) - movl -4(%rbp), %ebx + movl %edi, -16(%rbp) + movq %rsi, -24(%rbp) + movl -16(%rbp), %ebx addl $1, %ebx - movl $0, %eax + movl $0, %eax + popq %rbx popq %rbp .cfi_def_cfa 7, 8 ret From c50f7e9a425bfa4ab8655c79a715c88ed3b1e830 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 16 Apr 2024 09:17:52 +0900 Subject: [PATCH 013/300] [InstCombine] Remove mul of SPF abs fold (#88675) Remove the fold working on abs in SPF representation now that we canonicalize SPF to intrinsics. This is not strictly NFC because the SPF fold might fire for non-canonical IR due to multi-use, but given the lack of test coverage, I assume this is not important. 
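
For reference, a small IR sketch (mine, not from the patch) of the two abs
forms involved:

```llvm
; Select-pattern (SPF) form of abs -- the fold removed here used to
; rewrite %m to mul i32 %x, %x when it fired on this shape:
%neg = sub i32 0, %x
%cmp = icmp sgt i32 %x, -1
%abs = select i1 %cmp, i32 %x, i32 %neg
%m   = mul i32 %abs, %abs

; Canonical intrinsic form, still handled by the remaining code:
%abs2 = call i32 @llvm.abs.i32(i32 %x, i1 false)
%m2   = mul i32 %abs2, %abs2   ; folds to mul i32 %x, %x
```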
---
 .../InstCombine/InstCombineMulDivRem.cpp      | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 4dc1319f1c437f..48372381a0d1cd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -319,19 +319,12 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
   }
 
   // abs(X) * abs(X) -> X * X
-  // nabs(X) * nabs(X) -> X * X
-  if (Op0 == Op1) {
-    Value *X, *Y;
-    SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
-    if (SPF == SPF_ABS || SPF == SPF_NABS)
-      return BinaryOperator::CreateMul(X, X);
-
-    if (match(Op0, m_Intrinsic<Intrinsic::abs>(m_Value(X))))
-      return BinaryOperator::CreateMul(X, X);
-  }
+  Value *X;
+  if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::abs>(m_Value(X))))
+    return BinaryOperator::CreateMul(X, X);
 
   {
-    Value *X, *Y;
+    Value *Y;
     // abs(X) * abs(Y) -> abs(X * Y)
     if (I.hasNoSignedWrap() &&
         match(Op0,
@@ -344,7 +337,7 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
   }
 
   // -X * C --> X * -C
-  Value *X, *Y;
+  Value *Y;
   Constant *Op1C;
   if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Constant(Op1C)))
     return BinaryOperator::CreateMul(X, ConstantExpr::getNeg(Op1C));

From 2b06ff555aa32b316710b4708fbc16f36d6eab15 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Mon, 15 Apr 2024 17:38:39 -0700
Subject: [PATCH 014/300] [RISCV] Expand mul to shNadd x, (slli x, c) in DAGCombine (#88524)

This expansion is directly inspired by the analogous code in the x86
backend for LEA. shXadd and (this sub-case of) LEA are largely equivalent.

This is an alternative to https://github.com/llvm/llvm-project/pull/87105.
This expansion is also supported via the decomposeMulByConstant callback,
but restricted because of interactions with other combines since that code
runs before legalization. As discussed in the other review, my original
plan had been to support post legalization expansion through the same
interface, but that ended up being more complicated than seems justified.

Instead, lets go ahead and do the general expansion post-legalize. Other
targets use the combine approach, and matching that structure makes it
easier for us to adapt ideas from other targets to RISCV.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 52 +++++++++++++++++--
 llvm/test/CodeGen/RISCV/addimm-mulimm.ll      |  9 ++--
 llvm/test/CodeGen/RISCV/rv32zba.ll            | 48 +++++++++++------
 .../CodeGen/RISCV/rv64-legal-i32/rv64zba.ll   | 48 +++++++++++------
 llvm/test/CodeGen/RISCV/rv64zba.ll            | 48 +++++++++++------
 5 files changed, 153 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 27387595164a46..259cc388276c69 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -13363,10 +13363,56 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
   return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget);
 }
 
-static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG) {
+// Try to expand a scalar multiply to a faster sequence.
+static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
+                         TargetLowering::DAGCombinerInfo &DCI,
+                         const RISCVSubtarget &Subtarget) {
+
   EVT VT = N->getValueType(0);
-  if (!VT.isVector())
+
+  // LI + MUL is usually smaller than the alternative sequence.
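+  // (Illustrative example, not part of the original patch: with Zba, a
+  // multiply by 258 = (1 << 8) + 2 becomes
+  //   slli a1, a0, 8
+  //   sh1add a0, a0, a1
+  // rather than li+mul; the mul258/mul260/mul264 tests below check this.)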
+ if (DAG.getMachineFunction().getFunction().hasMinSize()) + return SDValue(); + + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + if (VT != Subtarget.getXLenVT()) + return SDValue(); + + if (!Subtarget.hasStdExtZba()) + return SDValue(); + + ConstantSDNode *CNode = dyn_cast(N->getOperand(1)); + if (!CNode) return SDValue(); + uint64_t MulAmt = CNode->getZExtValue(); + + // If this is a power 2 + 2/4/8, we can use a shift followed by a single + // shXadd. First check if this a sum of two power of 2s because that's + // easy. Then count how many zeros are up to the first bit. + if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { + unsigned ScaleShift = llvm::countr_zero(MulAmt); + if (ScaleShift >= 1 && ScaleShift < 4) { + unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); + SDLoc DL(N); + SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(ShiftAmt, DL, VT)); + SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(ScaleShift, DL, VT)); + return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2); + } + } + return SDValue(); +} + + +static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const RISCVSubtarget &Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return expandMul(N, DAG, DCI, Subtarget); SDLoc DL(N); SDValue N0 = N->getOperand(0); @@ -15913,7 +15959,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case ISD::MUL: if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) return V; - return performMULCombine(N, DAG); + return performMULCombine(N, DAG, DCI, Subtarget); case ISD::SDIV: case ISD::UDIV: case ISD::SREM: diff --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll index 48fa69e1045656..10103f071462c5 100644 --- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll +++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll @@ -551,8 +551,9 @@ define i64 @add_mul_combine_infinite_loop(i64 %x) { ; RV32IMB-NEXT: sh3add a1, a1, a2 ; RV32IMB-NEXT: sh1add a0, a0, a0 ; RV32IMB-NEXT: slli a2, a0, 3 -; RV32IMB-NEXT: addi a0, a2, 2047 -; RV32IMB-NEXT: addi a0, a0, 1 +; RV32IMB-NEXT: li a3, 1 +; RV32IMB-NEXT: slli a3, a3, 11 +; RV32IMB-NEXT: sh3add a0, a0, a3 ; RV32IMB-NEXT: sltu a2, a0, a2 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret @@ -561,8 +562,8 @@ define i64 @add_mul_combine_infinite_loop(i64 %x) { ; RV64IMB: # %bb.0: ; RV64IMB-NEXT: addi a0, a0, 86 ; RV64IMB-NEXT: sh1add a0, a0, a0 -; RV64IMB-NEXT: li a1, -16 -; RV64IMB-NEXT: sh3add a0, a0, a1 +; RV64IMB-NEXT: slli a0, a0, 3 +; RV64IMB-NEXT: addi a0, a0, -16 ; RV64IMB-NEXT: ret %tmp0 = mul i64 %x, 24 %tmp1 = add i64 %tmp0, 2048 diff --git a/llvm/test/CodeGen/RISCV/rv32zba.ll b/llvm/test/CodeGen/RISCV/rv32zba.ll index 0908a393338c50..cc632a09c8054b 100644 --- a/llvm/test/CodeGen/RISCV/rv32zba.ll +++ b/llvm/test/CodeGen/RISCV/rv32zba.ll @@ -271,31 +271,49 @@ define i32 @mul288(i32 %a) { } define i32 @mul258(i32 %a) { -; CHECK-LABEL: mul258: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 258 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV32I-LABEL: mul258: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 258 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: mul258: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: slli a1, a0, 8 +; RV32ZBA-NEXT: sh1add a0, a0, a1 +; RV32ZBA-NEXT: ret %c = mul i32 %a, 258 ret i32 %c } define i32 @mul260(i32 %a) { -; CHECK-LABEL: mul260: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 260 -; CHECK-NEXT: mul a0, a0, a1 
-; CHECK-NEXT: ret +; RV32I-LABEL: mul260: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 260 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: mul260: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: slli a1, a0, 8 +; RV32ZBA-NEXT: sh2add a0, a0, a1 +; RV32ZBA-NEXT: ret %c = mul i32 %a, 260 ret i32 %c } define i32 @mul264(i32 %a) { -; CHECK-LABEL: mul264: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 264 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV32I-LABEL: mul264: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 264 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: mul264: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: slli a1, a0, 8 +; RV32ZBA-NEXT: sh3add a0, a0, a1 +; RV32ZBA-NEXT: ret %c = mul i32 %a, 264 ret i32 %c } diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll index 90cfb1fdcb779f..ee9b73ca82f213 100644 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll @@ -811,31 +811,49 @@ define i64 @adduw_imm(i32 signext %0) nounwind { } define i64 @mul258(i64 %a) { -; CHECK-LABEL: mul258: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 258 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul258: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 258 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul258: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh1add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 258 ret i64 %c } define i64 @mul260(i64 %a) { -; CHECK-LABEL: mul260: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 260 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul260: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 260 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul260: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh2add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 260 ret i64 %c } define i64 @mul264(i64 %a) { -; CHECK-LABEL: mul264: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 264 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul264: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 264 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul264: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh3add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 264 ret i64 %c } diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index bb4be323ecb2e5..0d1d4838c61133 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -834,31 +834,49 @@ define i64 @adduw_imm(i32 signext %0) nounwind { } define i64 @mul258(i64 %a) { -; CHECK-LABEL: mul258: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 258 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul258: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 258 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul258: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh1add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 258 ret i64 %c } define i64 @mul260(i64 %a) { -; CHECK-LABEL: mul260: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 260 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul260: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 260 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul260: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh2add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 260 ret i64 
%c } define i64 @mul264(i64 %a) { -; CHECK-LABEL: mul264: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 264 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul264: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 264 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul264: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh3add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 264 ret i64 %c } From 40bbdb609f58d6cbbae1ca525832d7a21641a347 Mon Sep 17 00:00:00 2001 From: Alina Sbirlea Date: Mon, 15 Apr 2024 17:36:15 -0700 Subject: [PATCH 015/300] Revert "[DAG] Fold extract_subvector(insert_subvector(x,y,c1),c2) --> extract_subvector(y,c2-c1) (#87925)" This reverts commit 8c0f52e9d5a99bf96bb64ac23b5893482c292527. Reverting to green, reproducer attached in the PR/revision comments. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 17 - .../any_extend_vector_inreg_of_broadcast.ll | 42 +- llvm/test/CodeGen/X86/dpbusd.ll | 2 +- llvm/test/CodeGen/X86/dpbusd_i4.ll | 2 +- .../vector-interleaved-load-i16-stride-3.ll | 1300 +++++++++-------- .../vector-interleaved-store-i8-stride-7.ll | 602 ++++---- .../zero_extend_vector_inreg_of_broadcast.ll | 28 +- 7 files changed, 1006 insertions(+), 987 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cbba3a294b3d68..0fa0bf2609bb31 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24467,23 +24467,6 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { if (!LegalOperations || TLI.isOperationLegal(ISD::SPLAT_VECTOR, NVT)) return DAG.getSplatVector(NVT, DL, V.getOperand(0)); - // extract_subvector(insert_subvector(x,y,c1),c2) - // --> extract_subvector(y,c2-c1) - // iff we're just extracting from the inserted subvector. 
- if (V.getOpcode() == ISD::INSERT_SUBVECTOR) { - SDValue InsSub = V.getOperand(1); - EVT InsSubVT = InsSub.getValueType(); - unsigned NumInsElts = InsSubVT.getVectorMinNumElements(); - unsigned InsIdx = V.getConstantOperandVal(2); - unsigned NumSubElts = NVT.getVectorMinNumElements(); - if (InsIdx <= ExtIdx && (ExtIdx + NumSubElts) <= (InsIdx + NumInsElts) && - TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx)) { - SDLoc DL(N); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, InsSub, - DAG.getVectorIdxConstant(ExtIdx - InsIdx, DL)); - } - } - // Try to move vector bitcast after extract_subv by scaling extraction index: // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') if (V.getOpcode() == ISD::BITCAST && diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 39c7ce1413d1b3..4242d8483e7233 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -314,8 +314,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v ; ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -324,8 +324,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v ; ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -981,7 +981,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -992,7 +992,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -3507,12 +3507,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3522,12 +3523,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3766,10 +3768,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512F-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 @@ -3782,10 +3784,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512DQ-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 @@ -4145,9 +4147,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0 @@ -4159,9 +4161,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll index 04d7a9691b645f..fbea08eb1e5502 100644 --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -26,7 +26,7 @@ define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll index a212f99680ef4d..906fead7f8db53 100644 --- a/llvm/test/CodeGen/X86/dpbusd_i4.ll +++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll @@ -86,7 +86,7 @@ define i32 @mul_sext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-NEXT: vpsraw $12, %ymm0, %ymm0 ; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 6d5fc9ed0ab5b6..1436922f9dd114 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -1828,22 +1828,22 @@ 
define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i16_stride3_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512-NEXT: vmovdqa %ymm1, %ymm3 ; AVX512-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -1857,14 +1857,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512-NEXT: vmovdqa %ymm1, %ymm10 ; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -1885,19 +1885,21 @@ 
define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] +; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1906,22 +1908,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride3_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, 
%ymm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -1935,14 +1937,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm10 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -1963,19 +1965,21 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1984,22 +1988,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-LABEL: load_i16_stride3_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm3 ; AVX512DQ-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -2013,14 +2017,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm10 ; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -2041,19 +2045,21 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; 
AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -2062,22 +2068,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm3 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -2091,14 +2097,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; 
AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm10 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -2119,19 +2125,21 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3492,668 +3500,688 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-LABEL: load_i16_stride3_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18 -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm20 +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm21 ; AVX512-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm5 -; AVX512-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm5 +; AVX512-NEXT: vmovdqa 272(%rdi), %xmm8 ; AVX512-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512-NEXT: vmovdqa %xmm2, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX512-NEXT: vmovdqa %xmm2, %xmm14 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512-NEXT: vpshufb %xmm9, %xmm6, %xmm6 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm21 -; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm22 +; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm23 +; AVX512-NEXT: vmovdqa %ymm0, %ymm6 +; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] +; 
AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-NEXT: vpshufb %ymm11, %ymm6, %ymm12 ; AVX512-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512-NEXT: vmovdqa %xmm1, %xmm8 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 -; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa %xmm1, %xmm6 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm24 +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5 +; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm10 -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] -; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm11 +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] +; AVX512-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10 -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] -; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 +; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] +; AVX512-NEXT: vpshufb %ymm3, %ymm10, %ymm2 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] -; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; 
AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 -; AVX512-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] -; AVX512-NEXT: vmovdqa64 %xmm8, %xmm25 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] +; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] +; AVX512-NEXT: vmovdqa64 %xmm6, %xmm25 ; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512-NEXT: vpshufb 
%xmm3, %xmm14, %xmm14 +; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa %xmm14, %xmm7 +; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] +; AVX512-NEXT: vmovdqa64 %xmm8, %xmm27 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 -; AVX512-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] -; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] -; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa %ymm13, %ymm6 -; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] -; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: 
vinserti64x4 $1, %ymm3, %zmm1, %zmm19 +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX512-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] +; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] +; AVX512-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512-NEXT: vpshufb 
%xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 +; AVX512-NEXT: vextracti32x4 $2, %zmm5, %xmm5 +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] +; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] ; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride3_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm21 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512-FCP-NEXT: vpternlogq $202, 
%ymm20, %ymm21, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm5 +; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm8 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm14 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22 +; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm23 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm12 ; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] +; 
AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm11 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm2 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm25 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512-FCP-NEXT: 
vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm25 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm27 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, 
%zmm2, %zmm19 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm6 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; 
AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] +; AVX512-FCP-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] +; AVX512-FCP-NEXT: vmovdqa 
{{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 +; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm5, %xmm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] +; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm4, %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride3_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm20 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm21 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm5 -; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm5 +; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm8 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm14 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm6, %xmm6 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm21 -; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm22 +; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm23 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm6 +; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm6, %ymm12 ; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm6 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm24 +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-NEXT: vpternlogq $202, %ymm23, 
%ymm11, %ymm5 +; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm10 -; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm11 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512DQ-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm10, %ymm2 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm25 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] +; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 +; 
AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm25 ; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa %xmm14, %xmm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm27 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] -; 
AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm6 -; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw 
{{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] +; AVX512DQ-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm5, %xmm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] +; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm4, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm21 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = 
[4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm14 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm15 -; 
AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm11 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: 
vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm27 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm14 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 +; AVX512DQ-FCP-NEXT: vpermq 
{{.*#+}} ymm6 = ymm3[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm5, %xmm5 +; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 8091afbbfd70c3..8b6ba51506ab79 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -1246,28 +1246,29 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7] +; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 +; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero +; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] ; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] -; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] -; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-FCP-NEXT: vmovq %xmm2, 48(%rax) -; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) -; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) +; AVX512BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512BW-FCP-NEXT: vmovq %xmm0, 48(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -1325,28 +1326,29 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = 
zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, 48(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax) +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 48(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 @@ -2051,76 +2053,77 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq 
{{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512-NEXT: vmovdqa (%r8), %xmm3 -; AVX512-NEXT: vmovdqa (%r9), %xmm4 -; AVX512-NEXT: vmovdqa (%r10), %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7 -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512-NEXT: vpternlogq $50, %ymm10, %ymm12, %ymm11 -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero -; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25] -; AVX512-NEXT: vpternlogq $200, %ymm11, %ymm12, %ymm13 -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 -; AVX512-NEXT: vporq %zmm10, %zmm11, %zmm10 -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; AVX512-NEXT: vpternlogq $200, %ymm11, %ymm13, %ymm12 -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX512-NEXT: vpandn %ymm12, %ymm13, %ymm12 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512-NEXT: vporq %zmm12, %zmm11, %zmm11 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] 
-; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u] -; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX512-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512-NEXT: vmovdqa (%r8), %xmm0 +; AVX512-NEXT: vmovdqa (%r10), %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm3 +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2 +; AVX512-NEXT: vinserti128 $1, (%r9), %ymm0, %ymm0 +; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[12,13,u,u,u],zero,zero,xmm6[14,15,u,u,u] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,12,13],zero,zero,xmm4[u,u,u,14,15],zero,zero,xmm4[u,u,u] +; AVX512-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15] +; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[1],zero,zero,ymm4[u,u,u,10,2],zero,zero,ymm4[u,u,u,11,3],zero,zero,ymm4[u,u,u,20,28],zero,zero,ymm4[u,u,u,21,29],zero,zero,ymm4[u] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[1,3,3,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] +; AVX512-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm7 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] -; AVX512-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] -; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0 -; AVX512-NEXT: vmovdqa %xmm1, 96(%rax) -; AVX512-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 +; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 +; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm4 +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512-NEXT: vpternlogq $50, %ymm6, %ymm8, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u,u],zero +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25] +; AVX512-NEXT: vpternlogq $200, %ymm6, %ymm8, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10],zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512-NEXT: vporq %zmm3, %zmm2, %zmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512-NEXT: vpandn %ymm3, %ymm6, %ymm3 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm0[4],zero,ymm0[u,u,u,u,u,5],zero,ymm0[u,u,u,u,u,6],zero,ymm0[u,u,u,u,u],zero,ymm0[23,u,u,u,u,u],zero,ymm0[24,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4,u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u] +; AVX512-NEXT: vpternlogq $200, %ymm3, %ymm6, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512-NEXT: vmovdqa %xmm5, 96(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -2128,69 +2131,70 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm4 -; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] -; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u] -; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 -; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm11 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] -; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21] -; AVX512-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm9 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm1, 96(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 +; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = 
xmm4[u,u],zero,zero,xmm4[12,13,u,u,u],zero,zero,xmm4[14,15,u,u,u] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,12,13],zero,zero,xmm2[u,u,u,14,15],zero,zero,xmm2[u,u,u] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[3,1,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1],zero,zero,ymm2[u,u,u,10,2],zero,zero,ymm2[u,u,u,11,3],zero,zero,ymm2[u,u,u,20,28],zero,zero,ymm2[u,u,u,21,29],zero,zero,ymm2[u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,3,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9,u,u,u],zero,zero,ymm4[2,10,u,u,u],zero,zero,ymm4[3,19,u,u,u],zero,zero,ymm4[28,20,u,u,u],zero,zero,ymm4[29,21,u] +; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3] +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] +; AVX512-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 +; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm5, %ymm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u],zero,zero,ymm6[1,5,u,u,u],zero,zero,ymm6[2,6,u,u,u],zero,zero,ymm6[19,23,u,u,u],zero,zero,ymm6[24,28,u,u,u],zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero +; AVX512-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,5],zero,zero,ymm7[u,u,u,2,6],zero,zero,ymm7[u,u,u,19,23],zero,zero,ymm7[u,u,u,24,28],zero,zero,ymm7[u,u,u,25] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,0,0,0,0] +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 +; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -2198,76 +2202,77 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq $50, %ymm10, %ymm12, %ymm11 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25] -; AVX512DQ-NEXT: vpternlogq $200, %ymm11, %ymm12, %ymm13 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 -; AVX512DQ-NEXT: vporq %zmm10, %zmm11, %zmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = 
zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; AVX512DQ-NEXT: vpternlogq $200, %ymm11, %ymm13, %ymm12 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX512DQ-NEXT: vpandn %ymm12, %ymm13, %ymm12 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512DQ-NEXT: vporq %zmm12, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u] -; AVX512DQ-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm6 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2 +; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[12,13,u,u,u],zero,zero,xmm6[14,15,u,u,u] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,12,13],zero,zero,xmm4[u,u,u,14,15],zero,zero,xmm4[u,u,u] +; AVX512DQ-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15] +; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 +; 
AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[1],zero,zero,ymm4[u,u,u,10,2],zero,zero,ymm4[u,u,u,11,3],zero,zero,ymm4[u,u,u,20,28],zero,zero,ymm4[u,u,u,21,29],zero,zero,ymm4[u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[1,3,3,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] +; AVX512DQ-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm7 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] -; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] -; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0 -; AVX512DQ-NEXT: vmovdqa %xmm1, 96(%rax) -; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 +; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq $50, %ymm6, %ymm8, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} 
ymm6 = ymm2[u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u,u],zero +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25] +; AVX512DQ-NEXT: vpternlogq $200, %ymm6, %ymm8, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10],zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512DQ-NEXT: vporq %zmm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512DQ-NEXT: vpandn %ymm3, %ymm6, %ymm3 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4],zero,ymm0[u,u,u,u,u,5],zero,ymm0[u,u,u,u,u,6],zero,ymm0[u,u,u,u,u],zero,ymm0[23,u,u,u,u,u],zero,ymm0[24,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4,u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u] +; AVX512DQ-NEXT: vpternlogq $200, %ymm3, %ymm6, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 +; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqa %xmm5, 96(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-NEXT: vmovdqa %ymm4, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2275,69 +2280,70 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm11 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] -; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21] -; AVX512DQ-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm9 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 96(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[12,13,u,u,u],zero,zero,xmm4[14,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,12,13],zero,zero,xmm2[u,u,u,14,15],zero,zero,xmm2[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] +; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[3,1,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1],zero,zero,ymm2[u,u,u,10,2],zero,zero,ymm2[u,u,u,11,3],zero,zero,ymm2[u,u,u,20,28],zero,zero,ymm2[u,u,u,21,29],zero,zero,ymm2[u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,3,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9,u,u,u],zero,zero,ymm4[2,10,u,u,u],zero,zero,ymm4[3,19,u,u,u],zero,zero,ymm4[28,20,u,u,u],zero,zero,ymm4[29,21,u] +; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] +; AVX512DQ-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm5, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u],zero,zero,ymm6[1,5,u,u,u],zero,zero,ymm6[2,6,u,u,u],zero,zero,ymm6[19,23,u,u,u],zero,zero,ymm6[24,28,u,u,u],zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero +; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,5],zero,zero,ymm7[u,u,u,2,6],zero,zero,ymm7[u,u,u,19,23],zero,zero,ymm7[u,u,u,24,28],zero,zero,ymm7[u,u,u,25] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 99e8cdb179c8dc..11f422d671541a 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -314,8 +314,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v ; ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: 
vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -324,8 +324,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v ; ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -981,7 +981,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -992,7 +992,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4026,10 +4026,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -4062,10 +4062,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -4541,9 +4541,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 @@ -4559,9 +4559,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 From 694c444b5bbb56dcba8978d283fe5385237c309a Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 15 Apr 2024 20:58:08 -0400 Subject: [PATCH 016/300] [gn] port 311ff3917827 more --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 865a79b63cd848..ee44558a4e9947 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -982,6 +982,7 @@ if (current_toolchain == default_toolchain) { "ctgmath", "ctime", "ctype.h", + "cuchar", "cwchar", "cwctype", "deque", From 2ac562ab784c6bf04e6d3026c567d3552a735668 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 15 Apr 2024 17:58:39 -0700 Subject: [PATCH 017/300] [Sema] Mark alias/ifunc targets used and consider mangled names https://reviews.llvm.org/D54188 marked "alias" targets as used in C to fix -Wunused false positives. This patch extends the approach to handle mangled names to support global scope names in C++ and the `overloadable` attribute in C. In addition, we mark ifunc targets as used to fix #63957. While our approach has false negatives for namespace scope names, the majority of alias/ifunc C++ uses (global scope with no overloads) are handled. Note: The following function with internal linkage but C language linkage type is mangled in Clang but not in GCC. This inconsistency makes alias/ifunc difficult to use in C++ with portability (#88593). 
```
extern "C" {
static void f0() {}
// GCC: void g0() __attribute__((alias("_ZL2f0v")));
// Clang: void g0() __attribute__((alias("f0")));
}
```

Pull Request: https://github.com/llvm/llvm-project/pull/87130
---
 clang/lib/Sema/CMakeLists.txt                 |  1 +
 clang/lib/Sema/SemaDeclAttr.cpp               | 44 ++++++++++++++-----
 clang/test/AST/ast-dump-attr-json.cpp         |  1 +
 clang/test/Sema/alias-unused-win.cpp          |  2 +-
 clang/test/Sema/alias-unused.cpp              | 16 ++++---
 .../llvm-project-overlay/clang/BUILD.bazel    |  1 +
 6 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt
index ab3b813a9ccd97..a96439df664228 100644
--- a/clang/lib/Sema/CMakeLists.txt
+++ b/clang/lib/Sema/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   Core
+  Demangle
   FrontendHLSL
   FrontendOpenMP
   MC
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index b7b1fbc625a150..d26f130b5774ce 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -45,6 +45,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/STLForwardCompat.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Demangle/Demangle.h"
 #include "llvm/IR/Assumptions.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/Support/Error.h"
@@ -1983,6 +1984,36 @@ static void handleWeakRefAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   D->addAttr(::new (S.Context) WeakRefAttr(S.Context, AL));
 }
 
+// Mark alias/ifunc target as used. Due to name mangling, we look up the
+// demangled name ignoring parameters (not supported by microsoftDemangle
+// https://github.com/llvm/llvm-project/issues/88825). This should handle the
+// majority of use cases while leaving namespace scope names unmarked.
+static void markUsedForAliasOrIfunc(Sema &S, Decl *D, const ParsedAttr &AL,
+                                    StringRef Str) {
+  std::unique_ptr<char, llvm::FreeDeleter> Demangled;
+  if (S.getASTContext().getCXXABIKind() != TargetCXXABI::Microsoft)
+    Demangled.reset(llvm::itaniumDemangle(Str, /*ParseParams=*/false));
+  std::unique_ptr<MangleContext> MC(S.Context.createMangleContext());
+  SmallString<256> Name;
+
+  const DeclarationNameInfo Target(
+      &S.Context.Idents.get(Demangled ? Demangled.get() : Str), AL.getLoc());
+  LookupResult LR(S, Target, Sema::LookupOrdinaryName);
+  if (S.LookupName(LR, S.TUScope)) {
+    for (NamedDecl *ND : LR) {
+      if (MC->shouldMangleDeclName(ND)) {
+        llvm::raw_svector_ostream Out(Name);
+        Name.clear();
+        MC->mangleName(GlobalDecl(ND), Out);
+      } else {
+        Name = ND->getIdentifier()->getName();
+      }
+      if (Name == Str)
+        ND->markUsed(S.Context);
+    }
+  }
+}
+
 static void handleIFuncAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   StringRef Str;
   if (!S.checkStringLiteralArgumentAttr(AL, 0, Str))
@@ -1995,6 +2026,7 @@ static void handleIFuncAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
     return;
   }
 
+  markUsedForAliasOrIfunc(S, D, AL, Str);
   D->addAttr(::new (S.Context) IFuncAttr(S.Context, AL, Str));
 }
 
@@ -2029,17 +2061,7 @@ static void handleAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
     }
   }
 
-  // Mark target used to prevent unneeded-internal-declaration warnings.
-  if (!S.LangOpts.CPlusPlus) {
-    // FIXME: demangle Str for C++, as the attribute refers to the mangled
-    // linkage name, not the pre-mangled identifier.
- const DeclarationNameInfo target(&S.Context.Idents.get(Str), AL.getLoc()); - LookupResult LR(S, target, Sema::LookupOrdinaryName); - if (S.LookupQualifiedName(LR, S.getCurLexicalContext())) - for (NamedDecl *ND : LR) - ND->markUsed(S.Context); - } - + markUsedForAliasOrIfunc(S, D, AL, Str); D->addAttr(::new (S.Context) AliasAttr(S.Context, AL, Str)); } diff --git a/clang/test/AST/ast-dump-attr-json.cpp b/clang/test/AST/ast-dump-attr-json.cpp index 051c2956abfdf7..883e584bfedf07 100644 --- a/clang/test/AST/ast-dump-attr-json.cpp +++ b/clang/test/AST/ast-dump-attr-json.cpp @@ -46,6 +46,7 @@ __thread __attribute__ ((tls_model ("local-exec"))) int tls_model_var; // CHECK-NEXT: "tokLen": 11 // CHECK-NEXT: } // CHECK-NEXT: }, +// CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "global_decl", // CHECK-NEXT: "mangledName": "global_decl", // CHECK-NEXT: "type": { diff --git a/clang/test/Sema/alias-unused-win.cpp b/clang/test/Sema/alias-unused-win.cpp index 47c96d41175179..97d57a3bbd1e31 100644 --- a/clang/test/Sema/alias-unused-win.cpp +++ b/clang/test/Sema/alias-unused-win.cpp @@ -7,7 +7,7 @@ extern "C" { static int f(void) { return 42; } // cxx-warning{{unused function 'f'}} int g(void) __attribute__((alias("f"))); -static int foo [] = { 42, 0xDEAD }; // cxx-warning{{variable 'foo' is not needed and will not be emitted}} +static int foo [] = { 42, 0xDEAD }; extern typeof(foo) bar __attribute__((unused, alias("foo"))); static int __attribute__((overloadable)) f0(int x) { return x; } // expected-warning{{unused function 'f0'}} diff --git a/clang/test/Sema/alias-unused.cpp b/clang/test/Sema/alias-unused.cpp index dc8e46f072d74d..c0b541c880e525 100644 --- a/clang/test/Sema/alias-unused.cpp +++ b/clang/test/Sema/alias-unused.cpp @@ -14,24 +14,26 @@ extern typeof(foo) bar __attribute__((unused, alias("foo"))); /// We report a warning in C++ mode because the internal linkage `resolver` gets /// mangled as it does not have a language linkage. GCC does not mangle /// `resolver` or report a warning. -static int (*resolver(void))(void) { return f; } // expected-warning{{unused function 'resolver'}} +static int (*resolver(void))(void) { return f; } // cxx-warning{{unused function 'resolver'}} int ifunc(void) __attribute__((ifunc("resolver"))); -static int __attribute__((overloadable)) f0(int x) { return x; } // expected-warning{{unused function 'f0'}} +static int __attribute__((overloadable)) f0(int x) { return x; } static float __attribute__((overloadable)) f0(float x) { return x; } // expected-warning{{unused function 'f0'}} int g0(void) __attribute__((alias("_ZL2f0i"))); #ifdef __cplusplus -static int f1() { return 42; } // expected-warning{{unused function 'f1'}} +static int f1() { return 42; } int g1(void) __attribute__((alias("_ZL2f1v"))); } -static int f2(int) { return 42; } // expected-warning{{unused function 'f2'}} -static int f2() { return 42; } // expected-warning{{unused function 'f2'}} +/// We demangle alias/ifunc target and mark all found functions as used. 
+
+static int f2(int) { return 42; } // cxx-warning{{unused function 'f2'}}
+static int f2() { return 42; }
 int g2() __attribute__((alias("_ZL2f2v")));
 
-static int (*resolver1())() { return f; } // expected-warning{{unused function 'resolver1'}}
-static int (*resolver1(int))() { return f; } // expected-warning{{unused function 'resolver1'}}
+static int (*resolver1())() { return f; } // cxx-warning{{unused function 'resolver1'}}
+static int (*resolver1(int))() { return f; }
 int ifunc1() __attribute__((ifunc("_ZL9resolver1i")));
 
 /// TODO: We should report "unused function" for f3(int).
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index c2f77e3abca0e6..725ac6bb38120b 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1136,6 +1136,7 @@ cc_library(
         "//llvm:AllTargetsAsmParsers",
         "//llvm:AllTargetsCodeGens",
         "//llvm:Core",
+        "//llvm:Demangle",
         "//llvm:FrontendHLSL",
         "//llvm:FrontendOpenMP",
         "//llvm:MC",

From 8aa7e378dee27ec81959ef6750a7dd07cefdc77d Mon Sep 17 00:00:00 2001
From: Allen
Date: Tue, 16 Apr 2024 09:00:03 +0800
Subject: [PATCH 018/300] [InterleavedAccessPass] Get round the unsupported
 large scalable vectors (#88643)

When building with the option -msve-vector-bits=512, the return value of
Subtarget->getMinSVEVectorSizeInBits() is 512, while MinElts is still 4
in getNumInterleavedAccesses, so an invalid
llvm.aarch64.sve.ld2.sret.nxv4f64 is created, which would need to be
split. Unfortunately, the related custom splitting is not supported yet.

Fixes https://github.com/llvm/llvm-project/issues/88247
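For illustration, a minimal IR sketch of the failure mode, distilled from the
regression test this patch adds (the wrapper name @sketch is made up for the
example; the types, attributes, and intrinsics come from that test):

```
; Under vscale_range(4,4) (i.e. -msve-vector-bits=512), lowering this
; deinterleave previously requested the invalid
; llvm.aarch64.sve.ld2.sret.nxv4f64; with the fix it is split into two
; llvm.aarch64.sve.ld2.sret.nxv2f64 calls instead.
define { <vscale x 4 x double>, <vscale x 4 x double> } @sketch(ptr %ptr) vscale_range(4,4) "target-features"="+sve" {
  %wide.vec = load <vscale x 8 x double>, ptr %ptr, align 8
  %ldN = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %wide.vec)
  ret { <vscale x 4 x double>, <vscale x 4 x double> } %ldN
}
```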
+define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(ptr %ptr) #2 {
+; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
+; CHECK-NEXT:    [[LDN1:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 2
+; CHECK-NEXT:    [[LDN2:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN2]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN2]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
+; CHECK-NEXT:    [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
+; CHECK-NEXT:    ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
+;
+  %wide.vec = load <vscale x 8 x double>, ptr %ptr, align 8
+  %ldN = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %wide.vec)
+  ret { <vscale x 4 x double>, <vscale x 4 x double> } %ldN
+}
+
 attributes #0 = { vscale_range(2,2) "target-features"="+sve" }
 attributes #1 = { vscale_range(2,4) "target-features"="+sve" }
 attributes #2 = { vscale_range(4,4) "target-features"="+sve" }

From d23a85066b5d89a47fde7b6777e0fc5a711b99d4 Mon Sep 17 00:00:00 2001
From: Matthias Braun
Date: Mon, 15 Apr 2024 18:19:06 -0700
Subject: [PATCH 019/300] InstCombine: Increase threadlocal.address alignment
 if pointee is more aligned (#88435)

Increase alignment of `llvm.threadlocal.address` if the pointed-to
global has higher alignment.
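As a minimal sketch of the fold (a hypothetical module, separate from the
tests added below): given

  @tls = thread_local global i32 0, align 8

InstCombine now rewrites

  %p = call align 2 ptr @llvm.threadlocal.address.p0(ptr @tls)

into

  %p = call align 8 ptr @llvm.threadlocal.address.p0(ptr @tls)

since getKnownAlignment proves the operand is at least align 8. Calls whose
return alignment already matches or exceeds the known alignment are left
unchanged.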
---
 .../InstCombine/InstCombineCalls.cpp          |  9 ++++
 .../InstCombine/threadlocal_address.ll        | 41 +++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/threadlocal_address.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 20f51c8af617de..bae8579fc3650b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3408,6 +3408,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       return I;
     break;
   }
+  case Intrinsic::threadlocal_address: {
+    Align MinAlign = getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT);
+    MaybeAlign Align = II->getRetAlign();
+    if (MinAlign > Align.valueOrOne()) {
+      II->addRetAttr(Attribute::getWithAlignment(II->getContext(), MinAlign));
+      return II;
+    }
+    break;
+  }
   default: {
     // Handle target specific intrinsics
     std::optional<Instruction *> V = targetInstCombineIntrinsic(*II);
diff --git a/llvm/test/Transforms/InstCombine/threadlocal_address.ll b/llvm/test/Transforms/InstCombine/threadlocal_address.ll
new file mode 100644
index 00000000000000..0c220d996839e9
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/threadlocal_address.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -o - -S %s -passes=instcombine | FileCheck %s
+
+@tlsvar_a4 = thread_local global i32 4, align 4
+
+define void @func_increase_alignment() {
+; CHECK-LABEL: define void @func_increase_alignment() {
+; CHECK-NEXT:    [[P:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr @tlsvar_a4)
+; CHECK-NEXT:    store i32 42, ptr [[P]], align 2
+; CHECK-NEXT:    ret void
+;
+  %p = call align 2 ptr @llvm.threadlocal.address(ptr @tlsvar_a4)
+  store i32 42, ptr %p, align 2
+  ret void
+}
+
+@tlsvar_a32 = thread_local global i32 5, align 32
+
+define i1 @func_add_alignment() {
+; CHECK-LABEL: define i1 @func_add_alignment() {
+; CHECK-NEXT:    ret i1 true
+;
+  %p = call ptr @llvm.threadlocal.address(ptr @tlsvar_a32)
+  %p_int = ptrtoint ptr %p to i32
+  %lowbits = and i32 %p_int, 31
+  %zero = icmp eq i32 %lowbits, 0
+  ret i1 %zero
+}
+
+@tlsvar_a1 = thread_local global i8 6, align 1
+
+define i1 @func_dont_reduce_alignment() {
+; CHECK-LABEL: define i1 @func_dont_reduce_alignment() {
+; CHECK-NEXT:    ret i1 true
+;
+  %p = call align 4 ptr @llvm.threadlocal.address(ptr @tlsvar_a1)
+  %p_int = ptrtoint ptr %p to i32
+  %lowbits = and i32 %p_int, 3
+  %zero = icmp eq i32 %lowbits, 0
+  ret i1 %zero
+}

From 2e26ee9dce32d12ffa9bfb7f7d3e97778f0b7a75 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Mon, 15 Apr 2024 18:22:15 -0700
Subject: [PATCH 020/300] [DWARF] Clarify a variable name. NFC (#88814)

The parameter of `findDebugNamesOffsets` has been renamed to
`EndOfHeaderOffset` in #88064 to make it clear it is a section offset
instead of an offset relative to the current name index. Rename the
call site variable as well.
---
 llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index 22c9e8cd143c2e..ac19ac79329718 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -572,12 +572,12 @@ dwarf::findDebugNamesOffsets(uint64_t EndOfHeaderOffset,
 Error DWARFDebugNames::NameIndex::extract() {
   const DWARFDataExtractor &AS = Section.AccelSection;
-  uint64_t hdrSize = Base;
-  if (Error E = Hdr.extract(AS, &hdrSize))
+  uint64_t EndOfHeaderOffset = Base;
+  if (Error E = Hdr.extract(AS, &EndOfHeaderOffset))
     return E;
 
   const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
-  Offsets = dwarf::findDebugNamesOffsets(hdrSize, Hdr);
+  Offsets = dwarf::findDebugNamesOffsets(EndOfHeaderOffset, Hdr);
   uint64_t Offset =
       Offsets.EntryOffsetsBase + (Hdr.NameCount * SectionOffsetSize);
 
From f3a8112d9839a166f7eb240c6c72d7ecd47d3560 Mon Sep 17 00:00:00 2001
From: Shih-Po Hung
Date: Tue, 16 Apr 2024 09:37:32 +0800
Subject: [PATCH 021/300] [RISCV][TTI] Scale the cost of ICmp with LMUL
 (#88235)

Use the Val type to estimate the instruction cost for ICmp.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  17 +-
 llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll | 320 +++++++++---------
 .../LoopVectorize/RISCV/illegal-type.ll       |  13 +-
 3 files changed, 174 insertions(+), 176 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index bc9756c5e6ddad..56f5bd8794ae76 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1335,8 +1335,8 @@ InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                      I);
 
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
   if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
-    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
     if (CondTy->isVectorTy()) {
       if (ValTy->getScalarSizeInBits() == 1) {
         // vmandn.mm v8, v8, v9
@@ -1375,14 +1375,15 @@
         LT.second, CostKind);
   }
 
-  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
-      ValTy->isVectorTy()) {
-    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
-
-    // Support natively.
- if (CmpInst::isIntPredicate(VecPred)) - return LT.first * 1; + if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() && + CmpInst::isIntPredicate(VecPred)) { + // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE + // provided they incur the same cost across all implementations + return LT.first * + getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind); + } + if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy()) { // If we do not support the input floating point vector type, use the base // one which will calculate as: // ScalarizeCost + Num * Cost for fixed vector, diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll index 27d24faf0a8dae..caa6d6f483a243 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll @@ -9,38 +9,38 @@ define void @icmp_eq() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp eq <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp eq <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp eq <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp eq <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp eq <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp eq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp eq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp eq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp eq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp eq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp eq undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp eq undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp eq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp eq <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp eq <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp eq <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp eq <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp eq <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp eq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp eq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp eq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp eq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp eq undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp eq undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %nxv16i16 = icmp eq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp eq <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp eq <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp eq <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp eq <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp eq <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp eq <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp eq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp eq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp eq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp eq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp eq undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp eq undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp eq undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp eq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp eq <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp eq <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp eq <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp eq <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp eq <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp eq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp eq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp eq undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp eq undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp eq undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp eq undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp eq undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp eq <2 x i8> undef, undef @@ -96,38 +96,38 @@ define void @icmp_ne() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp ne <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp ne <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp ne <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp ne <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 
= icmp ne <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp ne <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp ne <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp ne <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp ne <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp ne <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp ne <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp ne <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp ne <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp ne <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp ne <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp ne <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%nxv8i32 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp ne <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp ne <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp ne <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp ne <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp ne <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp ne <2 x i8> undef, undef @@ -183,38 +183,38 @@ define void @icmp_ugt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp ugt <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp ugt <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp ugt <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp ugt <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp ugt <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp ugt <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp ugt <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp ugt <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp ugt <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp ugt 
<16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp ugt <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp ugt <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp ugt <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp ugt <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp ugt <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp ugt <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp ugt <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp ugt <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp ugt <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp ugt <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp ugt <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 8 for instruction: %nxv8i64 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp ugt <2 x i8> undef, undef @@ -270,38 +270,38 @@ define void @icmp_uge() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp uge <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp uge <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp uge <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp uge <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp uge <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp uge <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp uge <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp uge <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp uge <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp uge <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp uge <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp uge <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp uge <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp uge <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp uge <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %v16i32 = icmp uge <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp uge <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp uge <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp uge <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp uge <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp uge <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp uge <2 x i8> undef, undef @@ -357,38 +357,38 @@ define void @icmp_ult() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp ult <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp ult <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp ult <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp ult <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp ult <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %nxv32i8 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp ult <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp ult <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp ult <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp ult <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp ult <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp ult <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp ult <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp ult <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp ult <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp ult <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp ult <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp ult <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp ult <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp ult <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp ult <4 x i64> undef, 
undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp ult <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp ult <2 x i8> undef, undef @@ -444,38 +444,38 @@ define void @icmp_ule() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp ule <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp ule <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp ule <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp ule <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp ule <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp ule <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp ule <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp ule <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp ule <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp ule <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp ule undef, undef +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp ule <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp ule <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp ule <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp ule <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp ule <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp ule <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp ule <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp ule <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp ule <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp ule <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp ule <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp ule <2 x i8> undef, undef @@ -531,38 +531,38 @@ define void @icmp_sgt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp sgt <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp sgt <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp sgt <16 x i8> undef, undef -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp sgt <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp sgt <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp sgt <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp sgt <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp sgt <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp sgt <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp sgt <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp sgt <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp sgt <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp sgt <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp sgt <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp sgt <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp sgt <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %nxv16i32 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp sgt <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp sgt <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp sgt <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp sgt <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp sgt <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp sgt <2 x i8> undef, undef @@ -618,38 +618,38 @@ define void @icmp_sge() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp sge <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp sge <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp sge <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp sge <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp sge <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp sge <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp sge <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 
= icmp sge <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp sge <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp sge <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp sge <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp sge <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp sge <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp sge <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp sge <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp sge <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp sge <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp sge <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp sge <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp sge <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp sge <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp sge undef, undef +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp sge <2 x i8> undef, undef @@ -705,38 +705,38 @@ define void @icmp_slt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp slt <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp slt <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp slt <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp slt <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp slt <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp slt <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp slt <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp slt <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp slt <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp slt <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp slt <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp slt <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp slt <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %v16i32 = icmp slt <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp slt <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp slt <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp slt <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp slt <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp slt <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp slt <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp slt <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp slt <2 x i8> undef, undef @@ -792,38 +792,38 @@ define void @icmp_sle() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp sle <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp sle <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp sle <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp sle <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp sle <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %nxv8i8 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp sle <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp sle <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp sle <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp sle <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp sle <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp sle <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp sle <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp sle <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp sle <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp sle <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp sle <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp sle <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp sle <4 x 
i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp sle <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp sle <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp sle <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp sle <2 x i8> undef, undef diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll index c49f7d4a5d5f13..eeef8f199353b8 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll @@ -102,10 +102,10 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) { ; CHECK-LABEL: @uniform_store_i1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 64 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], 8 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[TMP1]] @@ -116,15 +116,12 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) { ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP3]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <32 x ptr> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP2]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <32 x ptr> [[TMP5]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i1> [[TMP7]], i32 31 ; CHECK-NEXT: store i1 [[TMP8]], ptr [[DST:%.*]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 512 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 256 
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: middle.block: From dfe12b3fd193318403622a8ae51e0362c27502d1 Mon Sep 17 00:00:00 2001 From: Benji Smith <6193112+Benjins@users.noreply.github.com> Date: Mon, 15 Apr 2024 21:37:55 -0400 Subject: [PATCH 022/300] [C API] Support uinc_wrap/udec_wrap in atomicrmw when accessing the bin op (#87163) These previously were added in the C++ API in 778cf5431cafc243f81dd5c8cbd27701ff7f9120, but without updating the enum in the C API or mapping functions. Corresponding tests for all current atomicrmw bin ops have been added as well. --- llvm/docs/ReleaseNotes.rst | 3 ++ llvm/include/llvm-c/Core.h | 60 +++++++++++++++------------- llvm/lib/IR/Core.cpp | 8 ++++ llvm/test/Bindings/llvm-c/atomics.ll | 25 ++++++++++++ 4 files changed, 68 insertions(+), 28 deletions(-) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 11add9274a7a07..76ef6ceb940780 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -149,6 +149,9 @@ Changes to the C API * Deprecated ``LLVMConstNUWNeg`` and ``LLVMBuildNUWNeg``. +* Added ``LLVMAtomicRMWBinOpUIncWrap`` and ``LLVMAtomicRMWBinOpUDecWrap`` to + ``LLVMAtomicRMWBinOp`` enum for AtomicRMW instructions. + Changes to the CodeGen infrastructure ------------------------------------- diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 6be5957ce61033..0b03f3b36fcdd3 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -361,35 +361,39 @@ typedef enum { } LLVMAtomicOrdering; typedef enum { - LLVMAtomicRMWBinOpXchg, /**< Set the new value and return the one old */ - LLVMAtomicRMWBinOpAdd, /**< Add a value and return the old one */ - LLVMAtomicRMWBinOpSub, /**< Subtract a value and return the old one */ - LLVMAtomicRMWBinOpAnd, /**< And a value and return the old one */ - LLVMAtomicRMWBinOpNand, /**< Not-And a value and return the old one */ - LLVMAtomicRMWBinOpOr, /**< OR a value and return the old one */ - LLVMAtomicRMWBinOpXor, /**< Xor a value and return the old one */ - LLVMAtomicRMWBinOpMax, /**< Sets the value if it's greater than the - original using a signed comparison and return - the old one */ - LLVMAtomicRMWBinOpMin, /**< Sets the value if it's Smaller than the - original using a signed comparison and return - the old one */ - LLVMAtomicRMWBinOpUMax, /**< Sets the value if it's greater than the - original using an unsigned comparison and return - the old one */ - LLVMAtomicRMWBinOpUMin, /**< Sets the value if it's greater than the - original using an unsigned comparison and return - the old one */ - LLVMAtomicRMWBinOpFAdd, /**< Add a floating point value and return the - old one */ - LLVMAtomicRMWBinOpFSub, /**< Subtract a floating point value and return the + LLVMAtomicRMWBinOpXchg, /**< Set the new value and return the one old */ + LLVMAtomicRMWBinOpAdd, /**< Add a value and return the old one */ + LLVMAtomicRMWBinOpSub, /**< Subtract a value and return the old one */ + LLVMAtomicRMWBinOpAnd, /**< And a value and return the old one */ + LLVMAtomicRMWBinOpNand, /**< Not-And a value and return the old one */ + LLVMAtomicRMWBinOpOr, /**< OR a value and return the old one */ + LLVMAtomicRMWBinOpXor, /**< Xor a value and return the old one */ + LLVMAtomicRMWBinOpMax, /**< Sets the value if it's greater than the + original using a signed comparison and return + the old one */ + LLVMAtomicRMWBinOpMin, /**< Sets 
the value if it's Smaller than the + original using a signed comparison and return + the old one */ + LLVMAtomicRMWBinOpUMax, /**< Sets the value if it's greater than the + original using an unsigned comparison and return + the old one */ + LLVMAtomicRMWBinOpUMin, /**< Sets the value if it's greater than the + original using an unsigned comparison and return + the old one */ + LLVMAtomicRMWBinOpFAdd, /**< Add a floating point value and return the old one */ - LLVMAtomicRMWBinOpFMax, /**< Sets the value if it's greater than the - original using an floating point comparison and - return the old one */ - LLVMAtomicRMWBinOpFMin, /**< Sets the value if it's smaller than the - original using an floating point comparison and - return the old one */ + LLVMAtomicRMWBinOpFSub, /**< Subtract a floating point value and return the + old one */ + LLVMAtomicRMWBinOpFMax, /**< Sets the value if it's greater than the + original using an floating point comparison and + return the old one */ + LLVMAtomicRMWBinOpFMin, /**< Sets the value if it's smaller than the + original using an floating point comparison and + return the old one */ + LLVMAtomicRMWBinOpUIncWrap, /**< Increments the value, wrapping back to zero + when incremented above input value */ + LLVMAtomicRMWBinOpUDecWrap, /**< Decrements the value, wrapping back to + the input value when decremented below zero */ } LLVMAtomicRMWBinOp; typedef enum { diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 8ce9c5ca63bede..6aff94f39d9c0c 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -3769,6 +3769,10 @@ static AtomicRMWInst::BinOp mapFromLLVMRMWBinOp(LLVMAtomicRMWBinOp BinOp) { case LLVMAtomicRMWBinOpFSub: return AtomicRMWInst::FSub; case LLVMAtomicRMWBinOpFMax: return AtomicRMWInst::FMax; case LLVMAtomicRMWBinOpFMin: return AtomicRMWInst::FMin; + case LLVMAtomicRMWBinOpUIncWrap: + return AtomicRMWInst::UIncWrap; + case LLVMAtomicRMWBinOpUDecWrap: + return AtomicRMWInst::UDecWrap; } llvm_unreachable("Invalid LLVMAtomicRMWBinOp value!"); @@ -3791,6 +3795,10 @@ static LLVMAtomicRMWBinOp mapToLLVMRMWBinOp(AtomicRMWInst::BinOp BinOp) { case AtomicRMWInst::FSub: return LLVMAtomicRMWBinOpFSub; case AtomicRMWInst::FMax: return LLVMAtomicRMWBinOpFMax; case AtomicRMWInst::FMin: return LLVMAtomicRMWBinOpFMin; + case AtomicRMWInst::UIncWrap: + return LLVMAtomicRMWBinOpUIncWrap; + case AtomicRMWInst::UDecWrap: + return LLVMAtomicRMWBinOpUDecWrap; default: break; } diff --git a/llvm/test/Bindings/llvm-c/atomics.ll b/llvm/test/Bindings/llvm-c/atomics.ll index e64a29944ef9df..162368c9d98d0e 100644 --- a/llvm/test/Bindings/llvm-c/atomics.ll +++ b/llvm/test/Bindings/llvm-c/atomics.ll @@ -36,6 +36,31 @@ define void @atomic_load_store(ptr %word) { ret void } +define void @atomic_rmw_ops(ptr %p, i32 %i, float %f) { + ; Test all atomicrmw operations + %a.xchg = atomicrmw xchg ptr %p, i32 %i acq_rel, align 8 + %a.add = atomicrmw add ptr %p, i32 %i acq_rel, align 8 + %a.sub = atomicrmw sub ptr %p, i32 %i acq_rel, align 8 + %a.and = atomicrmw and ptr %p, i32 %i acq_rel, align 8 + %a.nand = atomicrmw nand ptr %p, i32 %i acq_rel, align 8 + %a.or = atomicrmw or ptr %p, i32 %i acq_rel, align 8 + %a.xor = atomicrmw xor ptr %p, i32 %i acq_rel, align 8 + %a.max = atomicrmw max ptr %p, i32 %i acq_rel, align 8 + %a.min = atomicrmw min ptr %p, i32 %i acq_rel, align 8 + %a.umax = atomicrmw umax ptr %p, i32 %i acq_rel, align 8 + %a.umin = atomicrmw umin ptr %p, i32 %i acq_rel, align 8 + + %a.fadd = atomicrmw fadd ptr %p, float %f acq_rel, align 8 + %a.fsub = 
atomicrmw fsub ptr %p, float %f acq_rel, align 8 + %a.fmax = atomicrmw fmax ptr %p, float %f acq_rel, align 8 + %a.fmin = atomicrmw fmin ptr %p, float %f acq_rel, align 8 + + %a.uinc_wrap = atomicrmw uinc_wrap ptr %p, i32 %i acq_rel, align 8 + %a.udec_wrap = atomicrmw udec_wrap ptr %p, i32 %i acq_rel, align 8 + + ret void +} + define i32 @main() { %1 = alloca i32, align 4 %2 = cmpxchg ptr %1, i32 2, i32 3 seq_cst acquire From 5b9af38a03c8119cc2a42ae80d4a25e6f454c721 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Apr 2024 18:38:54 -0700 Subject: [PATCH 023/300] [RISCV] Provide a more efficient lowering for experimental.cttz.elts. (#88552) For experimental.cttz.elts, we can use a vfirst instruction, but we need to correct the result if input vector can be 0. cttz.elts returns the vector length while vfirst returns -1. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 36 ++++++ llvm/lib/Target/RISCV/RISCVISelLowering.h | 2 + .../RISCV/intrinsic-cttz-elts-vscale.ll | 114 ++++++++++++++---- .../test/CodeGen/RISCV/intrinsic-cttz-elts.ll | 22 +--- 4 files changed, 132 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 259cc388276c69..f6ed6420c9e1fa 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1484,6 +1484,11 @@ bool RISCVTargetLowering::shouldExpandGetVectorLength(EVT TripCountVT, return VF > MaxVF || !isPowerOf2_32(VF); } +bool RISCVTargetLowering::shouldExpandCttzElements(EVT VT) const { + return !Subtarget.hasVInstructions() || + VT.getVectorElementType() != MVT::i1 || !isTypeLegal(VT); +} + bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, @@ -8718,6 +8723,29 @@ static SDValue lowerGetVectorLength(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Res); } +static SDValue lowerCttzElts(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + SDValue Op0 = N->getOperand(1); + MVT OpVT = Op0.getSimpleValueType(); + MVT ContainerVT = OpVT; + if (OpVT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(DAG, OpVT, Subtarget); + Op0 = convertToScalableVector(ContainerVT, Op0, DAG, Subtarget); + } + MVT XLenVT = Subtarget.getXLenVT(); + SDLoc DL(N); + auto [Mask, VL] = getDefaultVLOps(OpVT, ContainerVT, DL, DAG, Subtarget); + SDValue Res = DAG.getNode(RISCVISD::VFIRST_VL, DL, XLenVT, Op0, Mask, VL); + if (isOneConstant(N->getOperand(2))) + return Res; + + // Convert -1 to VL. 
+ SDValue Setcc = + DAG.getSetCC(DL, XLenVT, Res, DAG.getConstant(0, DL, XLenVT), ISD::SETLT); + VL = DAG.getElementCount(DL, XLenVT, OpVT.getVectorElementCount()); + return DAG.getSelect(DL, XLenVT, Setcc, VL, Res); +} + static inline void promoteVCIXScalar(const SDValue &Op, SmallVectorImpl &Operands, SelectionDAG &DAG) { @@ -8913,6 +8941,8 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::experimental_get_vector_length: return lowerGetVectorLength(Op.getNode(), DAG, Subtarget); + case Intrinsic::experimental_cttz_elts: + return lowerCttzElts(Op.getNode(), DAG, Subtarget); case Intrinsic::riscv_vmv_x_s: { SDValue Res = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Res); @@ -12336,6 +12366,12 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); return; } + case Intrinsic::experimental_cttz_elts: { + SDValue Res = lowerCttzElts(N, DAG, Subtarget); + Results.push_back( + DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Res)); + return; + } case Intrinsic::riscv_orc_b: case Intrinsic::riscv_brev8: case Intrinsic::riscv_sha256sig0: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index ace5b3fd2b95b4..e2633733c31b19 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -986,6 +986,8 @@ class RISCVTargetLowering : public TargetLowering { bool shouldExpandGetVectorLength(EVT TripCountVT, unsigned VF, bool IsScalable) const override; + bool shouldExpandCttzElements(EVT VT) const override; + /// RVV code generation for fixed length vectors does not lower all /// BUILD_VECTORs. This makes BUILD_VECTOR legalisation a source of stores to /// merge. 
However, merging them creates a BUILD_VECTOR that is just as diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll index 65d0768c60885d..ea8feef3329840 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll @@ -128,43 +128,113 @@ define i64 @ctz_nxv8i1_no_range( %a) { define i32 @ctz_nxv16i1( %pg, %a) { ; RV32-LABEL: ctz_nxv16i1: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV32-NEXT: vfirst.m a0, v8 +; RV32-NEXT: bgez a0, .LBB2_2 +; RV32-NEXT: # %bb.1: ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 1 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vid.v v16 -; RV32-NEXT: li a1, -1 -; RV32-NEXT: vmadd.vx v16, a1, v8 -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV32-NEXT: vredmaxu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a1, v8 -; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: .LBB2_2: ; RV32-NEXT: ret ; ; RV64-LABEL: ctz_nxv16i1: ; RV64: # %bb.0: -; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-NEXT: vfirst.m a0, v8 +; RV64-NEXT: bgez a0, .LBB2_2 +; RV64-NEXT: # %bb.1: ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vid.v v16 -; RV64-NEXT: li a1, -1 -; RV64-NEXT: vmadd.vx v16, a1, v8 -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV64-NEXT: vredmaxu.vs v8, v8, v8 -; RV64-NEXT: vmv.x.s a1, v8 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: .LBB2_2: ; RV64-NEXT: ret %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( %a, i1 0) ret i32 %res } +define i32 @ctz_nxv16i1_poison( %pg, %a) { +; RV32-LABEL: ctz_nxv16i1_poison: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV32-NEXT: vfirst.m a0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_nxv16i1_poison: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-NEXT: vfirst.m a0, v8 +; RV64-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( %a, i1 1) + ret i32 %res +} + +define i32 @ctz_v16i1(<16 x i1> %pg, <16 x i1> %a) { +; RV32-LABEL: ctz_v16i1: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vfirst.m a0, v8 +; RV32-NEXT: bgez a0, .LBB4_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 16 +; RV32-NEXT: .LBB4_2: +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_v16i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vfirst.m a0, v8 +; RV64-NEXT: bgez a0, .LBB4_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a0, 16 +; RV64-NEXT: .LBB4_2: +; RV64-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0) + ret i32 %res +} + +define i32 @ctz_v16i1_poison(<16 x i1> %pg, <16 x i1> %a) { +; RV32-LABEL: ctz_v16i1_poison: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vfirst.m a0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_v16i1_poison: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vfirst.m a0, v8 +; RV64-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 1) + ret i32 %res +} + +define i16 @ctz_v8i1_i16_ret(<8 x i1> %a) { +; RV32-LABEL: ctz_v8i1_i16_ret: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vfirst.m a0, v0 +; RV32-NEXT: bgez a0, .LBB6_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 8 
+; RV32-NEXT: .LBB6_2: +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_v8i1_i16_ret: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vfirst.m a0, v0 +; RV64-NEXT: bgez a0, .LBB6_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a0, 8 +; RV64-NEXT: .LBB6_2: +; RV64-NEXT: ret + %res = call i16 @llvm.experimental.cttz.elts.i16.v8i1(<8 x i1> %a, i1 0) + ret i16 %res +} + declare i64 @llvm.experimental.cttz.elts.i64.nxv8i16(, i1) declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(, i1) declare i32 @llvm.experimental.cttz.elts.i32.nxv4i32(, i1) +declare i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1>, i1) +declare i16 @llvm.experimental.cttz.elts.i16.v16i1(<8 x i1>, i1) attributes #0 = { vscale_range(2,1024) } diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll index 49d4760a2e9abf..94b717b42e92b6 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll @@ -48,31 +48,13 @@ define i32 @ctz_v2i1_poison(<2 x i1> %a) { ; RV32-LABEL: ctz_v2i1_poison: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vim v8, v8, -1, v0 -; RV32-NEXT: vid.v v9 -; RV32-NEXT: vrsub.vi v9, v9, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vredmaxu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 2 -; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: andi a0, a1, 255 +; RV32-NEXT: vfirst.m a0, v0 ; RV32-NEXT: ret ; ; RV64-LABEL: ctz_v2i1_poison: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vim v8, v8, -1, v0 -; RV64-NEXT: vid.v v9 -; RV64-NEXT: vrsub.vi v9, v9, 2 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vredmaxu.vs v8, v8, v8 -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: li a1, 2 -; RV64-NEXT: subw a1, a1, a0 -; RV64-NEXT: andi a0, a1, 255 +; RV64-NEXT: vfirst.m a0, v0 ; RV64-NEXT: ret %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1) ret i32 %res From 4b22a923c4bfd0aa1d483149f84b6787263c2d76 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 16 Apr 2024 10:42:32 +0900 Subject: [PATCH 024/300] [IndVars] Mark truncs as nuw/nsw (#88686) When inserting truncs during IV widening, mark the trunc as either nuw or nsw depending on whether zext or sext widening was used. For non-negative IVs both nuw and nsw apply. 
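A minimal sketch of the resulting IR (hypothetical fragment; the value
names are illustrative, not taken from the tests below):

    ; A narrow i32 IV widened to i64: the trunc inserted for a remaining
    ; narrow use now records how the IV was widened.
    %t1 = trunc nuw i64 %iv.zext.widened to i32       ; zext widening: no unsigned wrap
    %t2 = trunc nsw i64 %iv.sext.widened to i32       ; sext widening: no signed wrap
    %t3 = trunc nuw nsw i64 %iv.nonneg.widened to i32 ; non-negative IV: both flags hold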
--- llvm/include/llvm/IR/IRBuilder.h | 14 ++++++- llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 40 ++++++++++--------- .../IndVarSimplify/AArch64/widen-loop-comp.ll | 8 ++-- .../Transforms/IndVarSimplify/X86/iv-widen.ll | 16 ++++---- .../Transforms/IndVarSimplify/elim-extend.ll | 2 +- ...wide-inc-for-narrow-use-recompute-flags.ll | 2 +- .../test/Transforms/IndVarSimplify/iv-sext.ll | 2 +- .../IndVarSimplify/iv-widen-elim-ext.ll | 8 ++-- llvm/test/Transforms/IndVarSimplify/lftr.ll | 4 +- .../IndVarSimplify/no-iv-rewrite.ll | 2 +- .../IndVarSimplify/post-inc-range.ll | 2 +- .../test/Transforms/IndVarSimplify/pr25578.ll | 2 +- .../test/Transforms/IndVarSimplify/pr55925.ll | 8 ++-- .../widen-nonnegative-countdown.ll | 22 +++++----- .../IndVarSimplify/widen-nonnegative.ll | 20 +++++----- llvm/test/Transforms/LoopFlatten/widen-iv3.ll | 2 +- 16 files changed, 84 insertions(+), 70 deletions(-) diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index f381273c46cfb8..b6534a1962a2f5 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -2004,8 +2004,18 @@ class IRBuilderBase { // Instruction creation methods: Cast/Conversion Operators //===--------------------------------------------------------------------===// - Value *CreateTrunc(Value *V, Type *DestTy, const Twine &Name = "") { - return CreateCast(Instruction::Trunc, V, DestTy, Name); + Value *CreateTrunc(Value *V, Type *DestTy, const Twine &Name = "", + bool IsNUW = false, bool IsNSW = false) { + if (V->getType() == DestTy) + return V; + if (Value *Folded = Folder.FoldCast(Instruction::Trunc, V, DestTy)) + return Folded; + Instruction *I = CastInst::Create(Instruction::Trunc, V, DestTy); + if (IsNUW) + I->setHasNoUnsignedWrap(); + if (IsNSW) + I->setHasNoSignedWrap(); + return Insert(I, Name); } Value *CreateZExt(Value *V, Type *DestTy, const Twine &Name = "", diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 440fe0790d7950..31be7d62c8d1d8 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -1153,6 +1153,7 @@ class WidenIV { Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter, PHINode *OrigPhi, PHINode *WidePhi); + void truncateIVUse(NarrowIVDefUse DU); bool widenLoopCompare(NarrowIVDefUse DU); bool widenWithVariantUse(NarrowIVDefUse DU); @@ -1569,15 +1570,18 @@ WidenIV::WidenedRecTy WidenIV::getWideRecurrence(WidenIV::NarrowIVDefUse DU) { /// This IV user cannot be widened. Replace this use of the original narrow IV /// with a truncation of the new wide IV to isolate and eliminate the narrow IV. 
-static void truncateIVUse(WidenIV::NarrowIVDefUse DU, DominatorTree *DT, - LoopInfo *LI) { +void WidenIV::truncateIVUse(NarrowIVDefUse DU) { auto *InsertPt = getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI); if (!InsertPt) return; LLVM_DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user " << *DU.NarrowUse << "\n"); + ExtendKind ExtKind = getExtendKind(DU.NarrowDef); IRBuilder<> Builder(InsertPt); - Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); + Value *Trunc = + Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType(), "", + DU.NeverNegative || ExtKind == ExtendKind::Zero, + DU.NeverNegative || ExtKind == ExtendKind::Sign); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); } @@ -1826,6 +1830,13 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, assert(ExtendKindMap.count(DU.NarrowDef) && "Should already know the kind of extension used to widen NarrowDef"); + // This narrow use can be widened by a sext if it's non-negative or its narrow + // def was widened by a sext. Same for zext. + bool CanWidenBySExt = + DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Sign; + bool CanWidenByZExt = + DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Zero; + // Stop traversing the def-use chain at inner-loop phis or post-loop phis. if (PHINode *UsePhi = dyn_cast(DU.NarrowUse)) { if (LI->getLoopFor(UsePhi->getParent()) != L) { @@ -1833,7 +1844,7 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, // After SimplifyCFG most loop exit targets have a single predecessor. // Otherwise fall back to a truncate within the loop. if (UsePhi->getNumOperands() != 1) - truncateIVUse(DU, DT, LI); + truncateIVUse(DU); else { // Widening the PHI requires us to insert a trunc. The logical place // for this trunc is in the same BB as the PHI. This is not possible if @@ -1847,7 +1858,8 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0)); BasicBlock *WidePhiBB = WidePhi->getParent(); IRBuilder<> Builder(WidePhiBB, WidePhiBB->getFirstInsertionPt()); - Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType()); + Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType(), "", + CanWidenByZExt, CanWidenBySExt); UsePhi->replaceAllUsesWith(Trunc); DeadInsts.emplace_back(UsePhi); LLVM_DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi << " to " @@ -1857,18 +1869,9 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, } } - // This narrow use can be widened by a sext if it's non-negative or its narrow - // def was widened by a sext. Same for zext. - auto canWidenBySExt = [&]() { - return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Sign; - }; - auto canWidenByZExt = [&]() { - return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Zero; - }; - // Our raison d'etre! Eliminate sign and zero extension. - if ((match(DU.NarrowUse, m_SExtLike(m_Value())) && canWidenBySExt()) || - (isa(DU.NarrowUse) && canWidenByZExt())) { + if ((match(DU.NarrowUse, m_SExtLike(m_Value())) && CanWidenBySExt) || + (isa(DU.NarrowUse) && CanWidenByZExt)) { Value *NewDef = DU.WideDef; if (DU.NarrowUse->getType() != WideType) { unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType()); @@ -1876,7 +1879,8 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, if (CastWidth < IVWidth) { // The cast isn't as wide as the IV, so insert a Trunc. 
IRBuilder<> Builder(DU.NarrowUse); - NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType()); + NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType(), "", + CanWidenByZExt, CanWidenBySExt); } else { // A wider extend was hidden behind a narrower one. This may induce @@ -1975,7 +1979,7 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, // This user does not evaluate to a recurrence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. - truncateIVUse(DU, DT, LI); + truncateIVUse(DU); return nullptr; } diff --git a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll index 6f659a88da2e2b..c5f656c870a23a 100644 --- a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll +++ b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll @@ -41,7 +41,7 @@ define i32 @test1() { ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_COND]] ; CHECK: if.then: ; CHECK-NEXT: [[I_05_LCSSA_WIDE:%.*]] = phi i64 [ [[INDVARS_IV]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[I_05_LCSSA_WIDE]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw nsw i64 [[I_05_LCSSA_WIDE]] to i32 ; CHECK-NEXT: store i32 [[TMP5]], ptr @idx, align 4 ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: for.cond.for.end.loopexit_crit_edge: @@ -237,7 +237,7 @@ define i32 @test4(i32 %a) { ; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[OR]] to i8 ; CHECK-NEXT: [[CALL:%.*]] = call i32 @fn1(i8 signext [[CONV3]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i32 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[INDVARS_IV_NEXT]] to i8 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw i32 [[INDVARS_IV_NEXT]] to i8 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[TMP0]], -14 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: @@ -466,7 +466,7 @@ define i32 @test9(ptr %a, i32 %b, i32 %init) { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP1]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 0, [[TMP2]] ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[FOR_END]] ; CHECK: for.end: @@ -997,7 +997,7 @@ define i32 @test16_unsigned_neg(i32 %start, ptr %p, ptr %q, i32 %x) { ; CHECK: loop: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[FOO:%.*]] = add i32 [[TMP1]], -1 ; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[GUARDED:%.*]] ; CHECK: guarded: diff --git a/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll b/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll index d05755bea0dddc..4e0c503794bfe4 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll @@ -23,7 +23,7 @@ define void @loop_0(ptr %a) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[B18_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[B24:%.*]] ] ; CHECK-NEXT: call void @use(i64 [[INDVARS_IV]]) ; 
CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[O:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[O]], align 4 ; CHECK-NEXT: [[T:%.*]] = icmp eq i32 [[V]], 0 @@ -37,7 +37,7 @@ define void @loop_0(ptr %a) { ; CHECK-NEXT: ret void ; CHECK: exit24: ; CHECK-NEXT: [[DOT02_LCSSA_WIDE:%.*]] = phi i64 [ [[INDVARS_IV]], [[B18]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[DOT02_LCSSA_WIDE]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i64 [[DOT02_LCSSA_WIDE]] to i32 ; CHECK-NEXT: call void @dummy(i32 [[TMP1]]) ; CHECK-NEXT: unreachable ; @@ -159,7 +159,7 @@ declare void @dummy(i32) declare void @dummy.i64(i64) -define void @loop_2(i32 %size, i32 %nsteps, i32 %hsize, ptr %lined, i8 %tmp1) { +define void @loop_2(i32 %size, i32 %nsteps, i32 %hsize, ptr %lined, i8 %arg) { ; CHECK-LABEL: @loop_2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP215:%.*]] = icmp sgt i32 [[SIZE:%.*]], 1 @@ -180,12 +180,12 @@ define void @loop_2(i32 %size, i32 %nsteps, i32 %hsize, ptr %lined, i8 %tmp1) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 1, [[FOR_BODY2_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY2]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[LINED:%.*]], i64 [[TMP4]] -; CHECK-NEXT: store i8 [[TMP1:%.*]], ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: store i8 [[ARG:%.*]], ptr [[ADD_PTR]], align 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY2]], label [[FOR_BODY3_PREHEADER:%.*]] ; CHECK: for.body3.preheader: -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nsw i64 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 ; CHECK-NEXT: [[WIDE_TRIP_COUNT7:%.*]] = zext i32 [[SIZE]] to i64 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] @@ -193,7 +193,7 @@ define void @loop_2(i32 %size, i32 %nsteps, i32 %hsize, ptr %lined, i8 %tmp1) { ; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ 1, [[FOR_BODY3_PREHEADER]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], [[INDVARS_IV3]] ; CHECK-NEXT: [[ADD_PTR2:%.*]] = getelementptr inbounds i8, ptr [[LINED]], i64 [[TMP7]] -; CHECK-NEXT: store i8 [[TMP1]], ptr [[ADD_PTR2]], align 1 +; CHECK-NEXT: store i8 [[ARG]], ptr [[ADD_PTR2]], align 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1 ; CHECK-NEXT: [[EXITCOND8:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT4]], [[WIDE_TRIP_COUNT7]] ; CHECK-NEXT: br i1 [[EXITCOND8]], label [[FOR_BODY3]], label [[FOR_INC_LOOPEXIT:%.*]] @@ -222,7 +222,7 @@ for.body2: %add4 = add nsw i32 %add, %k %idx.ext = sext i32 %add4 to i64 %add.ptr = getelementptr inbounds i8, ptr %lined, i64 %idx.ext - store i8 %tmp1, ptr %add.ptr, align 1 + store i8 %arg, ptr %add.ptr, align 1 %inc = add nsw i32 %k, 1 %cmp2 = icmp slt i32 %inc, %size br i1 %cmp2, label %for.body2, label %for.body3 @@ -233,7 +233,7 @@ for.body3: %add5 = add nuw i32 %add, %l %idx.ext2 = zext i32 %add5 to i64 %add.ptr2 = getelementptr inbounds i8, ptr %lined, i64 %idx.ext2 - store i8 %tmp1, ptr %add.ptr2, align 1 + store i8 %arg, ptr %add.ptr2, align 1 %inc2 = add nsw i32 %l, 1 %cmp3 = icmp slt i32 %inc2, %size br i1 %cmp3, 
label %for.body3, label %for.inc diff --git a/llvm/test/Transforms/IndVarSimplify/elim-extend.ll b/llvm/test/Transforms/IndVarSimplify/elim-extend.ll index 54bb9951ff66ab..01c95dadd16261 100644 --- a/llvm/test/Transforms/IndVarSimplify/elim-extend.ll +++ b/llvm/test/Transforms/IndVarSimplify/elim-extend.ll @@ -142,7 +142,7 @@ define void @nestedIV(ptr %address, i32 %limit) nounwind { ; CHECK-NEXT: br i1 [[EXITCOND]], label [[INNERLOOP]], label [[INNEREXIT:%.*]] ; CHECK: innerexit: ; CHECK-NEXT: [[INNERCOUNT_LCSSA_WIDE:%.*]] = phi i64 [ [[INDVARS_IV_NEXT]], [[INNERLOOP]] ] -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INNERCOUNT_LCSSA_WIDE]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = trunc nsw i64 [[INNERCOUNT_LCSSA_WIDE]] to i32 ; CHECK-NEXT: br label [[OUTERMERGE]] ; CHECK: outermerge: ; CHECK-NEXT: [[INNERCOUNT_MERGE]] = phi i32 [ [[TMP3]], [[INNEREXIT]] ], [ [[INNERCOUNT]], [[INNERPREHEADER]] ] diff --git a/llvm/test/Transforms/IndVarSimplify/hoist-wide-inc-for-narrow-use-recompute-flags.ll b/llvm/test/Transforms/IndVarSimplify/hoist-wide-inc-for-narrow-use-recompute-flags.ll index cc99ee312ccb7f..1135ca9dbf00dc 100644 --- a/llvm/test/Transforms/IndVarSimplify/hoist-wide-inc-for-narrow-use-recompute-flags.ll +++ b/llvm/test/Transforms/IndVarSimplify/hoist-wide-inc-for-narrow-use-recompute-flags.ll @@ -15,7 +15,7 @@ define void @test_pr82243(ptr %f) { ; CHECK-NEXT: [[GEP_IV_EXT:%.*]] = getelementptr i32, ptr [[F]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i32 1, ptr [[GEP_IV_EXT]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[SHL:%.*]] = shl i32 123, [[TMP0]] ; CHECK-NEXT: [[GEP_SHL:%.*]] = getelementptr i32, ptr [[F]], i32 [[SHL]] ; CHECK-NEXT: br label [[INNER_HEADER:%.*]] diff --git a/llvm/test/Transforms/IndVarSimplify/iv-sext.ll b/llvm/test/Transforms/IndVarSimplify/iv-sext.ll index 450913f16baa29..95a036f0e54c7c 100644 --- a/llvm/test/Transforms/IndVarSimplify/iv-sext.ll +++ b/llvm/test/Transforms/IndVarSimplify/iv-sext.ll @@ -99,7 +99,7 @@ define void @t(ptr %pval1, ptr %peakWeight, ptr %nrgReducePeakrate, i32 %bandEdg ; CHECK-NEXT: [[VAL35_LCSSA:%.*]] = phi float [ [[VAL35]], [[BB5]] ] ; CHECK-NEXT: [[VAL31_LCSSA_WIDE:%.*]] = phi i64 [ [[INDVARS_IV_NEXT]], [[BB5]] ] ; CHECK-NEXT: [[VAL30_LCSSA:%.*]] = phi float [ [[VAL30]], [[BB5]] ] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[VAL31_LCSSA_WIDE]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc nsw i64 [[VAL31_LCSSA_WIDE]] to i32 ; CHECK-NEXT: br label [[BB7]] ; CHECK: bb7: ; CHECK-NEXT: [[DISTERBHI_2_LCSSA]] = phi float [ [[VAL30_LCSSA]], [[BB5_BB7_CRIT_EDGE]] ], [ [[DISTERBHI_0_PH]], [[BB5_PREHEADER]] ] diff --git a/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll b/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll index 59a0241bfe9fde..a83e9ce74b12ab 100644 --- a/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll +++ b/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll @@ -22,7 +22,7 @@ define void @foo(ptr %A, ptr %B, ptr %C, i32 %N) { ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 ; CHECK-NEXT: [[DIV0:%.*]] = udiv i32 5, [[TMP3]] ; CHECK-NEXT: 
[[ADD4:%.*]] = add nsw i32 [[ADD3]], [[DIV0]] ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] @@ -224,7 +224,7 @@ define i32 @foo3(i32 %M) { ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc nsw i64 [[TMP3]] to i32 ; CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP4]] to i64 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x i32], ptr @a, i64 0, i64 [[IDXPROM4]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 @@ -365,7 +365,7 @@ define i32 @foo5(ptr %input, i32 %length, ptr %in) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_LR_PH]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INPUT]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw nsw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[IN:%.*]], i64 [[IDX_EXT]] @@ -514,7 +514,7 @@ define void @foo7(i32 %n, ptr %a, i32 %x) { ; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP2]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[TMP1]] diff --git a/llvm/test/Transforms/IndVarSimplify/lftr.ll b/llvm/test/Transforms/IndVarSimplify/lftr.ll index 41db925de577ea..7f4820f093e55e 100644 --- a/llvm/test/Transforms/IndVarSimplify/lftr.ll +++ b/llvm/test/Transforms/IndVarSimplify/lftr.ll @@ -525,7 +525,7 @@ define float @wide_trip_count_test3(ptr %b, ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[INDVARS_IV]], 20 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TEMP:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to float ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[TEMP]] ; CHECK-NEXT: [[ADD1]] = fadd float [[SUM_07]], [[MUL]] @@ -584,7 +584,7 @@ define float @wide_trip_count_test4(ptr %b, ; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 20 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TEMP:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to float ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[TEMP]] ; CHECK-NEXT: [[ADD1]] = fadd float [[SUM_07]], [[MUL]] diff --git a/llvm/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/llvm/test/Transforms/IndVarSimplify/no-iv-rewrite.ll index 
c35c5bacf68ca5..579b8536cedf03 100644 --- a/llvm/test/Transforms/IndVarSimplify/no-iv-rewrite.ll +++ b/llvm/test/Transforms/IndVarSimplify/no-iv-rewrite.ll @@ -213,7 +213,7 @@ define void @maxvisitor(i32 %limit, ptr %base) nounwind { ; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[VAL]], [[MAX]] ; CHECK-NEXT: br i1 [[CMP19]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: br label [[LOOP_INC]] ; CHECK: if.else: ; CHECK-NEXT: br label [[LOOP_INC]] diff --git a/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll b/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll index 5c22ba1044b60a..bbdee0267effb7 100644 --- a/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll +++ b/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll @@ -180,7 +180,7 @@ define void @test_neg(ptr %array_length_ptr, ptr %base, ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], [[LIMIT:%.*]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/IndVarSimplify/pr25578.ll b/llvm/test/Transforms/IndVarSimplify/pr25578.ll index d8adc178474c02..380e8171798b01 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr25578.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr25578.ll @@ -13,7 +13,7 @@ L1_header: ; CHECK: L2_header: ; CHECK: %[[INDVAR:.*]] = phi i64 -; CHECK: %[[TRUNC:.*]] = trunc i64 %[[INDVAR]] to i32 +; CHECK: %[[TRUNC:.*]] = trunc nuw nsw i64 %[[INDVAR]] to i32 L2_header: %i = phi i32 [ 0, %L1_header ], [ %i_next, %L2_latch ] %i_prom = sext i32 %i to i64 diff --git a/llvm/test/Transforms/IndVarSimplify/pr55925.ll b/llvm/test/Transforms/IndVarSimplify/pr55925.ll index 312a8295ccdc9f..2ad187add4e107 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr55925.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr55925.ll @@ -14,11 +14,11 @@ define void @test(ptr %p) personality ptr undef { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foo(i32 returned [[TMP0]]) ; CHECK-NEXT: to label [[LOOP_LATCH]] unwind label [[EXIT:%.*]] ; CHECK: loop.latch: -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @foo(i32 [[TMP1]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[LOOP]] @@ -56,8 +56,8 @@ define void @test_critedge(i1 %c, ptr %p) personality ptr undef { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br i1 [[C:%.*]], label [[LOOP_INVOKE:%.*]], label [[LOOP_OTHER:%.*]] ; CHECK: loop.invoke: -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 ; 
CHECK-NEXT: [[RES:%.*]] = invoke i32 @foo(i32 returned [[TMP0]]) ; CHECK-NEXT: to label [[LOOP_LATCH]] unwind label [[EXIT:%.*]] ; CHECK: loop.other: diff --git a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll index d473103f5824e6..9c8983421029f5 100644 --- a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll +++ b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll @@ -223,7 +223,7 @@ define void @sext_postinc(ptr %A, i32 %start) { ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP1]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: @@ -262,7 +262,7 @@ define void @sext_preinc(ptr %A, i32 %start) { ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP1]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: @@ -366,7 +366,7 @@ define void @zext_postinc_offset_constant_one(ptr %A, i32 %start) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[J_016_US:%.*]] = phi i32 [ [[INC_US:%.*]], [[FOR_BODY]] ], [ [[START]], [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -513,13 +513,13 @@ define void @sext_postinc_offset_constant_one(ptr %A, i32 %start) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: @@ -556,13 +556,13 @@ define void @sext_preinc_offset_constant_one(ptr %A, i32 %start) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add nuw i32 [[TMP1]], 1 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: @@ -808,13 +808,13 @@ define void @sext_postinc_offset_constant_minus_one(ptr %A, i32 %start) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP1]], -1 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: @@ -851,13 +851,13 @@ define void @sext_preinc_offset_constant_minus_one(ptr %A, i32 %start) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP1]], -1 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: diff --git a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll index 739db26311f4ab..e00eaafa3f1924 100644 --- a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll +++ b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll @@ -150,7 +150,7 @@ define void @sext_add_nuw(ptr %A, i32 %offset, i32 %M) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc 
i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) @@ -185,7 +185,7 @@ define void @sext_add_noflags(ptr %A, i32 %offset, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP0]], [[OFFSET:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -223,7 +223,7 @@ define void @zext_add_nsw(ptr %A, i32 %offset, i32 %M) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[TMP1]] to i32 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) @@ -293,7 +293,7 @@ define void @zext_add_noflags(ptr %A, i32 %offset, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP0]], [[OFFSET:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -399,7 +399,7 @@ define void @zext_nneg_add_noflags(ptr %A, i32 %offset, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP0]], [[OFFSET:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext nneg i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -475,7 +475,7 @@ define void @sext_mul_nuw(ptr %A, i32 %multiple, i32 %M) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) @@ -510,7 +510,7 @@ define void @sext_mul_noflags(ptr %A, i32 %multiple, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], 
[ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[MUL_US:%.*]] = mul i32 [[TMP0]], [[MULTIPLE:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[MUL_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -548,7 +548,7 @@ define void @zext_mul_nsw(ptr %A, i32 %multiple, i32 %M) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[TMP1]] to i32 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) @@ -618,7 +618,7 @@ define void @zext_mul_noflags(ptr %A, i32 %multiple, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[MUL_US:%.*]] = mul i32 [[TMP0]], [[MULTIPLE:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext i32 [[MUL_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -724,7 +724,7 @@ define void @zext_nneg_mul_noflags(ptr %A, i32 %multiple, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[MUL_US:%.*]] = mul i32 [[TMP0]], [[MULTIPLE:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext nneg i32 [[MUL_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll index 6e6c045661c249..3ac5a69a496ffc 100644 --- a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll +++ b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll @@ -35,7 +35,7 @@ define i16 @foo() { ; CHECK-NEXT: [[SUM_110:%.*]] = phi i16 [ [[SUM_012]], [[FOR_COND1_PREHEADER]] ], [ [[ADD5]], [[FOR_BODY4]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[INDVAR]], [[TMP0]] ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[J_011]], [[MUL]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i32 [[TMP2]] to i16 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i16], ptr @v, i16 0, i16 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ADD5]] = add nsw i16 [[TMP4]], [[SUM_110]] From 5f680724838188f516d349bd9459710308d721e0 Mon Sep 17 00:00:00 2001 From: Qizhi Hu <836744285@qq.com> Date: Tue, 16 Apr 2024 09:57:23 +0800 Subject: [PATCH 025/300] [Clang][Sema] Fix issue on requires expression with templated base class member function (#85198) Fix https://github.com/llvm/llvm-project/issues/84020 Skip checking implicit object parameter in the context of `RequiresExprBodyDecl`. 
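Distilled, the pattern that used to crash is a requires-expression naming an implicit object member function of a templated base: inside the `RequiresExprBodyDecl` the call is only checked for well-formedness, so it must not be rejected as a member call without an object. An illustrative variant of the new PR84020.cpp test added below (compiled with -std=c++20):

struct B {
  void bar();
};

template <typename T>
struct A : T {
  auto foo() {
    // Checked inside the requires-expression body: previously this call
    // through the templated base hit the crash instead of simply being
    // validated as a well-formed expression.
    static_assert(requires { T::bar(); });
  }
};

int main() {
  A<B> a;
  a.foo(); // instantiation exercised the buggy path
}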
Co-authored-by: huqizhi <836744285@qq.com> --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaExpr.cpp | 3 ++- clang/test/SemaCXX/PR84020.cpp | 23 +++++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 clang/test/SemaCXX/PR84020.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index dc108785f6cc99..76701dc723b6c3 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -538,6 +538,7 @@ Bug Fixes to C++ Support object parameter. Fixes (#GH70604), (#GH79754), (#GH84163), (#GH84425), (#GH86054), (#GH86398), and (#GH86399). - Fix a crash when deducing ``auto`` from an invalid dereference (#GH88329). +- Fix a crash in requires expression with templated base class member function. Fixes (#GH84020). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 24f354f1c72498..189764cb4b6b08 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -7739,7 +7739,8 @@ ExprResult Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl, } if (CXXMethodDecl *Method = dyn_cast_or_null(FDecl)) - if (Method->isImplicitObjectMemberFunction()) + if (!isa(CurContext) && + Method->isImplicitObjectMemberFunction()) return ExprError(Diag(LParenLoc, diag::err_member_call_without_object) << Fn->getSourceRange() << 0); diff --git a/clang/test/SemaCXX/PR84020.cpp b/clang/test/SemaCXX/PR84020.cpp new file mode 100644 index 00000000000000..8ea5dcc4527ae7 --- /dev/null +++ b/clang/test/SemaCXX/PR84020.cpp @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -std=c++20 -verify %s +// RUN: %clang_cc1 -std=c++23 -verify %s +// expected-no-diagnostics + +struct B { + template + void foo(); + + void bar(); +}; + +template +struct A : T { + auto foo() { + static_assert(requires { T::template foo(); }); + static_assert(requires { T::bar(); }); + } +}; + +int main() { + A a; + a.foo(); +} From 568368a43e5b4adb3c5d105a0eff3e0c13c0af8c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 15 Apr 2024 19:05:30 -0700 Subject: [PATCH 026/300] [Support] Make readNext default to unaligned (#88808) Without this patch, you would typically use readNext as: readNext(Ptr) which is quite mouthful. Since most serialization/deserialization operations are unaligned accesses, this patch makes the alignment template parameter default to unaligned, allowing us to say: readNext(Ptr) I'm including a few examples of migration in this patch. I'll do the rest in a separate patch. Note that writeNext already has the same trick for the alignment template parameter. --- llvm/include/llvm/Support/Endian.h | 4 ++-- llvm/include/llvm/Support/OnDiskHashTable.h | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/Support/Endian.h b/llvm/include/llvm/Support/Endian.h index 1cdb5ca0d5eaa1..30e0852b972c5a 100644 --- a/llvm/include/llvm/Support/Endian.h +++ b/llvm/include/llvm/Support/Endian.h @@ -80,8 +80,8 @@ template return ret; } -template +template [[nodiscard]] inline value_type readNext(const CharT *&memory) { return readNext(memory, endian); } diff --git a/llvm/include/llvm/Support/OnDiskHashTable.h b/llvm/include/llvm/Support/OnDiskHashTable.h index 0a8cbbd8b18832..f6b4055e74de7e 100644 --- a/llvm/include/llvm/Support/OnDiskHashTable.h +++ b/llvm/include/llvm/Support/OnDiskHashTable.h @@ -368,14 +368,12 @@ template class OnDiskChainedHashTable { // 'Items' starts with a 16-bit unsigned integer representing the // number of items in this bucket. 
- unsigned Len = - endian::readNext(Items); + unsigned Len = endian::readNext(Items); for (unsigned i = 0; i < Len; ++i) { // Read the hash. hash_value_type ItemHash = - endian::readNext(Items); + endian::readNext(Items); // Determine the length of the key and the data. const std::pair &L = @@ -473,8 +471,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable { // 'Items' starts with a 16-bit unsigned integer representing the // number of items in this bucket. NumItemsInBucketLeft = - endian::readNext( - Ptr); + endian::readNext(Ptr); } Ptr += sizeof(hash_value_type); // Skip the hash. // Determine the length of the key and the data. From b4cf63d26f4c41dd9403c4e62500d82a6d31d692 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Mon, 15 Apr 2024 20:27:04 -0600 Subject: [PATCH 027/300] [X86] Remove obsolete tablegen rules for near data in small static code model (#84523) These should already be handled by other code. Removing the kernel code model rules right above it causes bss_pagealigned.ll to fail by using a movabsq to get the address of a global; I haven't figured out where that code is yet. --- llvm/lib/Target/X86/X86InstrCompiler.td | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 270dd32c7235a2..2350636be10ef3 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1247,28 +1247,6 @@ def : Pat<(i64 (X86Wrapper mcsym:$dst)), def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>; -// If we have small model and -static mode, it is safe to store global addresses -// directly as immediates. FIXME: This is really a hack, the 'imm' predicate -// for MOV64mi32 should handle this sort of thing. -def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst), - (MOV64mi32 addr:$dst, tconstpool:$src)>, - Requires<[NearData, IsNotPIC]>; -def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst), - (MOV64mi32 addr:$dst, tjumptable:$src)>, - Requires<[NearData, IsNotPIC]>; -def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst), - (MOV64mi32 addr:$dst, tglobaladdr:$src)>, - Requires<[NearData, IsNotPIC]>; -def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst), - (MOV64mi32 addr:$dst, texternalsym:$src)>, - Requires<[NearData, IsNotPIC]>; -def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst), - (MOV64mi32 addr:$dst, mcsym:$src)>, - Requires<[NearData, IsNotPIC]>; -def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), - (MOV64mi32 addr:$dst, tblockaddress:$src)>, - Requires<[NearData, IsNotPIC]>; - def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>; def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>; From 82f479ba315a417b6cd01a8c2efdc15c26689f2e Mon Sep 17 00:00:00 2001 From: Usama Hameed Date: Mon, 15 Apr 2024 19:42:45 -0700 Subject: [PATCH 028/300] Add asan tests for libsanitizers. (#88349) This patch tests LLDB integration with libsanitizers for ASan.
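The test drives a binary built from the suite's existing main.c, which is not part of this diff. A hypothetical sketch of that file's shape, inferred only from the strings the tests expect (the f1/f2 frames, the global pointer, and the BOOM/free/break line markers); the real file may differ:

#include <stdlib.h>

char *pointer;

void f1() { pointer = malloc(10); } /* reported as "Memory allocated by Thread" */
void f2() { free(pointer); }        /* free line */

int main() {
  f1();
  f2();
  pointer[0] = 'x'; /* BOOM line: the "Use of deallocated memory" stop reason */
  return 0;         /* break line */
}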
rdar://111856681 --- lldb/test/API/functionalities/asan/Makefile | 6 +- .../functionalities/asan/TestMemoryHistory.py | 73 ++++++++++++++++++- .../functionalities/asan/TestReportData.py | 20 ++++- .../API/functionalities/libsanitizers/util.py | 3 + 4 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 lldb/test/API/functionalities/libsanitizers/util.py diff --git a/lldb/test/API/functionalities/asan/Makefile b/lldb/test/API/functionalities/asan/Makefile index 4913a18d8cc6f9..d66696fed7078f 100644 --- a/lldb/test/API/functionalities/asan/Makefile +++ b/lldb/test/API/functionalities/asan/Makefile @@ -1,4 +1,8 @@ C_SOURCES := main.c -CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info +asan: CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info +asan: all + +libsanitizers: CFLAGS_EXTRAS := -fsanitize=address -fsanitize-stable-abi -g -gcolumn-info +libsanitizers: all include Makefile.rules diff --git a/lldb/test/API/functionalities/asan/TestMemoryHistory.py b/lldb/test/API/functionalities/asan/TestMemoryHistory.py index 00162ae8822c74..ee7939203ead18 100644 --- a/lldb/test/API/functionalities/asan/TestMemoryHistory.py +++ b/lldb/test/API/functionalities/asan/TestMemoryHistory.py @@ -9,15 +9,21 @@ from lldbsuite.test import lldbplatform from lldbsuite.test import lldbutil +from functionalities.libsanitizers.util import no_libsanitizers class AsanTestCase(TestBase): @skipIfFreeBSD # llvm.org/pr21136 runtimes not yet available by default @expectedFailureNetBSD @skipUnlessAddressSanitizer def test(self): - self.build() + self.build(make_targets=["asan"]) self.asan_tests() + @skipIf(oslist=no_match(["macosx"])) + def test_libsanitizers_asan(self): + self.build(make_targets=["libsanitizers"]) + self.libsanitizer_tests() + def setUp(self): # Call super's setUp(). 
TestBase.setUp(self) @@ -26,6 +32,71 @@ def setUp(self): self.line_free = line_number("main.c", "// free line") self.line_breakpoint = line_number("main.c", "// break line") + # Test line numbers: rdar://126237493 + def libsanitizer_tests(self): + target = self.createTestTarget() + + if no_libsanitizers(self): + self.skipTest("libsanitizers not found") + + self.runCmd( + "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0" + ) + + self.runCmd("run") + + # In libsanitizers, memory history is not supported until a report has been generated + self.expect( + "thread list", + "Process should be stopped due to ASan report", + substrs=["stopped", "stop reason = Use of deallocated memory"], + ) + + # test the 'memory history' command + self.expect( + "memory history 'pointer'", + substrs=[ + "Memory deallocated by Thread", + "a.out`f2", + "main.c", + "Memory allocated by Thread", + "a.out`f1", + "main.c", + ], + ) + + # do the same using SB API + process = self.dbg.GetSelectedTarget().process + val = ( + process.GetSelectedThread().GetSelectedFrame().EvaluateExpression("pointer") + ) + addr = val.GetValueAsUnsigned() + threads = process.GetHistoryThreads(addr) + self.assertEqual(threads.GetSize(), 2) + + history_thread = threads.GetThreadAtIndex(0) + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + + history_thread = threads.GetThreadAtIndex(1) + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + + # let's free the container (SBThreadCollection) and see if the + # SBThreads still live + threads = None + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + def asan_tests(self): target = self.createTestTarget() diff --git a/lldb/test/API/functionalities/asan/TestReportData.py b/lldb/test/API/functionalities/asan/TestReportData.py index 543c5fe66a208d..de0c1206a57ad6 100644 --- a/lldb/test/API/functionalities/asan/TestReportData.py +++ b/lldb/test/API/functionalities/asan/TestReportData.py @@ -9,6 +9,7 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil +from functionalities.libsanitizers.util import no_libsanitizers class AsanTestReportDataCase(TestBase): @skipIfFreeBSD # llvm.org/pr21136 runtimes not yet available by default @@ -16,9 +17,14 @@ class AsanTestReportDataCase(TestBase): @skipUnlessAddressSanitizer @skipIf(archs=["i386"], bugnumber="llvm.org/PR36710") def test(self): - self.build() + self.build(make_targets=["asan"]) self.asan_tests() + @skipIf(oslist=no_match(["macosx"])) + def test_libsanitizers_asan(self): + self.build(make_targets=["libsanitizers"]) + self.asan_tests(libsanitizers=True) + def setUp(self): # Call super's setUp(). 
TestBase.setUp(self) @@ -29,10 +35,18 @@ def setUp(self): self.line_crash = line_number("main.c", "// BOOM line") self.col_crash = 16 - def asan_tests(self): + def asan_tests(self, libsanitizers=False): target = self.createTestTarget() - self.registerSanitizerLibrariesWithTarget(target) + if libsanitizers and no_libsanitizers(self): + self.skipTest("libsanitizers not found") + + if libsanitizers: + self.runCmd( + "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0" + ) + else: + self.registerSanitizerLibrariesWithTarget(target) self.runCmd("run") diff --git a/lldb/test/API/functionalities/libsanitizers/util.py b/lldb/test/API/functionalities/libsanitizers/util.py new file mode 100644 index 00000000000000..ad68541aba8d05 --- /dev/null +++ b/lldb/test/API/functionalities/libsanitizers/util.py @@ -0,0 +1,3 @@ +def no_libsanitizers(testbase): + testbase.runCmd("image list libsystem_sanitizers.dylib", check=False) + return not "libsystem_sanitizers.dylib" in testbase.res.GetOutput() From 00ae4b738be6e840cfdb3e7461409c3e49403e50 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 16 Apr 2024 03:49:28 +0000 Subject: [PATCH 029/300] Revert "[X86] Remove obsolete tablegen rules for near data in small static code model (#84523)" This reverts commit b4cf63d26f4c41dd9403c4e62500d82a6d31d692. Breaks indirect-branch-tracking-eh2.ll. --- llvm/lib/Target/X86/X86InstrCompiler.td | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 2350636be10ef3..270dd32c7235a2 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1247,6 +1247,28 @@ def : Pat<(i64 (X86Wrapper mcsym:$dst)), def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>; +// If we have small model and -static mode, it is safe to store global addresses +// directly as immediates. FIXME: This is really a hack, the 'imm' predicate +// for MOV64mi32 should handle this sort of thing. +def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tconstpool:$src)>, + Requires<[NearData, IsNotPIC]>; +def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tjumptable:$src)>, + Requires<[NearData, IsNotPIC]>; +def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tglobaladdr:$src)>, + Requires<[NearData, IsNotPIC]>; +def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, texternalsym:$src)>, + Requires<[NearData, IsNotPIC]>; +def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, mcsym:$src)>, + Requires<[NearData, IsNotPIC]>; +def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tblockaddress:$src)>, + Requires<[NearData, IsNotPIC]>; + def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>; def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>; From 8ee7d9732204a7415967bce2c1d2c0fa63af264f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 15 Apr 2024 20:51:31 -0700 Subject: [PATCH 030/300] [flang][cuda] Add fir.cuda_allocate operation (#88586) Allocatable with cuda device attribute have special semantic for the allocate statement. 
In flang the allocate statement is lowered to a sequence of runtime call initializing the descriptor and then allocating the descriptor data. This new operation will replace the last runtime call and abstract all the device memory allocation needed. The lowering patch will follow. --- .../include/flang/Optimizer/Dialect/FIROps.td | 32 +++++++++ .../flang/Optimizer/Dialect/FIRTypes.td | 1 + flang/lib/Optimizer/Dialect/FIROps.cpp | 19 +++++ flang/test/Fir/cuf-invalid.fir | 50 +++++++++++++ flang/test/Fir/cuf.mlir | 70 +++++++++++++++++++ 5 files changed, 172 insertions(+) create mode 100644 flang/test/Fir/cuf-invalid.fir create mode 100644 flang/test/Fir/cuf.mlir diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index dff1cdb20cbfef..c181c7ed62dff3 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3190,4 +3190,36 @@ def fir_CUDADataTransferOp : fir_Op<"cuda_data_transfer", []> { }]; } +def fir_CUDAAllocateOp : fir_Op<"cuda_allocate", [AttrSizedOperandSegments, + MemoryEffects<[MemAlloc]>]> { + let summary = "Perform the device allocation of data of an allocatable"; + + let description = [{ + The fir.cuda_allocate operation performs the allocation on the device + of the data of an allocatable. The descriptor passed to the operation + is initialized before with the standard flang runtime calls. + }]; + + let arguments = (ins Arg:$box, + Arg, "", [MemWrite]>:$errmsg, + Optional:$stream, + Arg, "", [MemWrite]>:$pinned, + Arg, "", [MemRead]>:$source, + fir_CUDADataAttributeAttr:$cuda_attr, + UnitAttr:$hasStat); + + let results = (outs AnyIntegerType:$stat); + + let assemblyFormat = [{ + $box `:` qualified(type($box)) + ( `source` `(` $source^ `:` qualified(type($source) )`)` )? + ( `errmsg` `(` $errmsg^ `:` type($errmsg) `)` )? + ( `stream` `(` $stream^ `:` type($stream) `)` )? + ( `pinned` `(` $pinned^ `:` type($pinned) `)` )? 
+ attr-dict `->` type($stat) + }]; + + let hasVerifier = 1; +} + #endif diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td index 4c6a8064991ab0..3b876e4642da9a 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td +++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td @@ -625,6 +625,7 @@ def AnyRefOrBoxLike : TypeConstraint, "any reference or box">; +def AnyRefOrBoxType : Type; def AnyShapeLike : TypeConstraint, "any legal shape type">; diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 8ab74103cb6a80..88710880174d21 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -3993,6 +3993,25 @@ mlir::LogicalResult fir::CUDAKernelOp::verify() { return mlir::success(); } +mlir::LogicalResult fir::CUDAAllocateOp::verify() { + if (getPinned() && getStream()) + return emitOpError("pinned and stream cannot appears at the same time"); + if (!fir::unwrapRefType(getBox().getType()).isa()) + return emitOpError( + "expect box to be a reference to/or a class or box type value"); + if (getSource() && + !fir::unwrapRefType(getSource().getType()).isa()) + return emitOpError( + "expect source to be a reference to/or a class or box type value"); + if (getErrmsg() && + !fir::unwrapRefType(getErrmsg().getType()).isa()) + return emitOpError( + "expect errmsg to be a reference to/or a box type value"); + if (getErrmsg() && !getHasStat()) + return emitOpError("expect stat attribute when errmsg is provided"); + return mlir::success(); +} + //===----------------------------------------------------------------------===// // FIROpsDialect //===----------------------------------------------------------------------===// diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir new file mode 100644 index 00000000000000..9c5ffe7176a3bd --- /dev/null +++ b/flang/test/Fir/cuf-invalid.fir @@ -0,0 +1,50 @@ +// RUN: fir-opt -split-input-file -verify-diagnostics %s + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %1 = fir.alloca i32 + %pinned = fir.alloca i1 + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %s = fir.load %1 : !fir.ref + // expected-error@+1{{'fir.cuda_allocate' op pinned and stream cannot appears at the same time}} + %13 = fir.cuda_allocate %11 : !fir.ref> stream(%s : i32) pinned(%pinned : !fir.ref) {cuda_attr = #fir.cuda} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %1 = fir.alloca i32 + // expected-error@+1{{'fir.cuda_allocate' op expect box to be a reference to/or a class or box type value}} + %2 = fir.cuda_allocate %1 : !fir.ref {cuda_attr = #fir.cuda} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %c100 = arith.constant 100 : index + %7 = fir.alloca !fir.char<1,100> {bindc_name = "msg", uniq_name = "_QFsub1Emsg"} + %8:2 = hlfir.declare %7 typeparams %c100 {uniq_name = "_QFsub1Emsg"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %9 = fir.embox %8#1 : (!fir.ref>) -> !fir.box> + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %16 = fir.convert %9 : (!fir.box>) -> 
!fir.box + // expected-error@+1{{'fir.cuda_allocate' op expect stat attribute when errmsg is provided}} + %13 = fir.cuda_allocate %11 : !fir.ref> errmsg(%16 : !fir.box) {cuda_attr = #fir.cuda} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %1 = fir.alloca i32 + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + // expected-error@+1{{'fir.cuda_allocate' op expect errmsg to be a reference to/or a box type value}} + %13 = fir.cuda_allocate %11 : !fir.ref> errmsg(%1 : !fir.ref) {cuda_attr = #fir.cuda, hasStat} -> i32 + return +} diff --git a/flang/test/Fir/cuf.mlir b/flang/test/Fir/cuf.mlir new file mode 100644 index 00000000000000..67eff31b35b2b8 --- /dev/null +++ b/flang/test/Fir/cuf.mlir @@ -0,0 +1,70 @@ +// RUN: fir-opt --split-input-file %s | fir-opt --split-input-file | FileCheck %s + +// Simple round trip test of operations. + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %13 = fir.cuda_allocate %11 : !fir.ref> {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %1 = fir.alloca i32 + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %s = fir.load %1 : !fir.ref + %13 = fir.cuda_allocate %11 : !fir.ref> stream(%s : i32) {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> stream(%{{.*}} : i32) {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %1 = fir.alloca !fir.box>> {bindc_name = "b", uniq_name = "_QFsub1Eb"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %5:2 = hlfir.declare %1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %12 = fir.convert %5#1 : (!fir.ref>>>) -> !fir.ref> + %13 = fir.cuda_allocate %11 : !fir.ref> source(%12 : !fir.ref>) {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> source(%{{.*}} : !fir.ref>) {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %pinned = fir.alloca i1 + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %13 = fir.cuda_allocate %11 : !fir.ref> pinned(%pinned : !fir.ref) {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> pinned(%{{.*}} : !fir.ref) {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca 
!fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %c100 = arith.constant 100 : index + %7 = fir.alloca !fir.char<1,100> {bindc_name = "msg", uniq_name = "_QFsub1Emsg"} + %8:2 = hlfir.declare %7 typeparams %c100 {uniq_name = "_QFsub1Emsg"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %9 = fir.embox %8#1 : (!fir.ref>) -> !fir.box> + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %16 = fir.convert %9 : (!fir.box>) -> !fir.box + %13 = fir.cuda_allocate %11 : !fir.ref> errmsg(%16 : !fir.box) {cuda_attr = #fir.cuda, hasStat} -> i32 + return +} From fe48bf672e1ab293368a3212203db94a4e21c533 Mon Sep 17 00:00:00 2001 From: Chelsea Cassanova Date: Mon, 15 Apr 2024 21:26:18 -0700 Subject: [PATCH 031/300] [lldb][lit] Guard MallocNanoZone envvar in shell tests (#88824) Previously the MallocNanoZone envvar would be set to 0 on Darwin for the LLDB shell tests, but this should guarded behind ASan being enabled as opposed to simply running the test suite behind Darwin. This required that the LLVM_USE_SANITIZER option be added as an attribute to the lit config for shell tests. --- lldb/test/Shell/lit.cfg.py | 12 ++++++++---- lldb/test/Shell/lit.site.cfg.py.in | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py index 290569576ac80d..e24f3fbb4d9318 100644 --- a/lldb/test/Shell/lit.cfg.py +++ b/lldb/test/Shell/lit.cfg.py @@ -50,10 +50,14 @@ ) # Enable sanitizer runtime flags. -config.environment["ASAN_OPTIONS"] = "detect_stack_use_after_return=1" -config.environment["TSAN_OPTIONS"] = "halt_on_error=1" -if platform.system() == "Darwin": - config.environment["MallocNanoZone"] = "0" +if "Address" in config.llvm_use_sanitizer: + config.environment["ASAN_OPTIONS"] = "detect_stack_use_after_return=1" + if platform.system() == "Darwin": + config.environment["MallocNanoZone"] = "0" + +if "Thread" in config.llvm_use_sanitizer: + config.environment["TSAN_OPTIONS"] = "halt_on_error=1" + # Support running the test suite under the lldb-repro wrapper. This makes it # possible to capture a test suite run and then rerun all the test from the diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in index 736dfc335732b5..b69e7bce1bc0be 100644 --- a/lldb/test/Shell/lit.site.cfg.py.in +++ b/lldb/test/Shell/lit.site.cfg.py.in @@ -26,6 +26,7 @@ config.lldb_enable_lua = @LLDB_ENABLE_LUA@ config.lldb_build_directory = "@LLDB_TEST_BUILD_DIRECTORY@" config.have_lldb_server = @LLDB_TOOL_LLDB_SERVER_BUILD@ config.lldb_system_debugserver = @LLDB_USE_SYSTEM_DEBUGSERVER@ +config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" # The shell tests use their own module caches. config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell") From 65b0cc610f80d9b9724a98cf7c5bcfd38e1cf799 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Apr 2024 21:54:26 -0700 Subject: [PATCH 032/300] [RISCV] Add FeatureStdExtI to all CPUs in RISCVProcessors.td. NFC (#88805) This is currently being implied in RISCVISAInfo.cpp. Make it explicit. I'm planning to move all extension information to RISCVFeatures.td and have tablegen create the tables for RISCVISAInfo.cpp. This requires making the creation of RISCVTargetParserDef.inc in tablegen independent of RISCVISAInfo.cpp. 
So we need an accurate extension list for CPUs in tablegen. --- llvm/lib/Target/RISCV/RISCVProcessors.td | 27 ++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index fd6d6078ec238b..739b50749e1323 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -56,11 +56,13 @@ class RISCVTuneProcessorModel, + [Feature32Bit, + FeatureStdExtI]>, GenericTuneInfo; def GENERIC_RV64 : RISCVProcessorModel<"generic-rv64", NoSchedModel, - [Feature64Bit]>, + [Feature64Bit, + FeatureStdExtI]>, GenericTuneInfo; // Support generic for compatibility with other targets. The triple will be used // to change to the appropriate rv32/rv64 version. @@ -69,11 +71,13 @@ def : ProcessorModel<"generic", NoSchedModel, []>, GenericTuneInfo; def ROCKET_RV32 : RISCVProcessorModel<"rocket-rv32", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr]>; def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr]>; def ROCKET : RISCVTuneProcessorModel<"rocket", @@ -86,6 +90,7 @@ def SIFIVE_7 : RISCVTuneProcessorModel<"sifive-7-series", def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -94,6 +99,7 @@ def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20", def SIFIVE_E21 : RISCVProcessorModel<"sifive-e21", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -103,6 +109,7 @@ def SIFIVE_E21 : RISCVProcessorModel<"sifive-e21", def SIFIVE_E24 : RISCVProcessorModel<"sifive-e24", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -112,6 +119,7 @@ def SIFIVE_E24 : RISCVProcessorModel<"sifive-e24", def SIFIVE_E31 : RISCVProcessorModel<"sifive-e31", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr, FeatureStdExtM, @@ -121,6 +129,7 @@ def SIFIVE_E31 : RISCVProcessorModel<"sifive-e31", def SIFIVE_E34 : RISCVProcessorModel<"sifive-e34", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -130,6 +139,7 @@ def SIFIVE_E34 : RISCVProcessorModel<"sifive-e34", def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76", SiFive7Model, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -140,6 +150,7 @@ def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76", def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -149,6 +160,7 @@ def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21", def SIFIVE_S51 : RISCVProcessorModel<"sifive-s51", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -158,6 +170,7 @@ def SIFIVE_S51 : RISCVProcessorModel<"sifive-s51", def SIFIVE_S54 : RISCVProcessorModel<"sifive-s54", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -168,6 +181,7 @@ def SIFIVE_S54 : RISCVProcessorModel<"sifive-s54", def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76", SiFive7Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -180,6 +194,7 @@ def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76", def 
SIFIVE_U54 : RISCVProcessorModel<"sifive-u54", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -190,6 +205,7 @@ def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54", def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74", SiFive7Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -200,6 +216,7 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74", def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -217,6 +234,7 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model, def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -247,6 +265,7 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -286,6 +305,7 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base", SyntacoreSCR1Model, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtC], @@ -294,6 +314,7 @@ def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base", def SYNTACORE_SCR1_MAX : RISCVProcessorModel<"syntacore-scr1-max", SyntacoreSCR1Model, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -303,6 +324,7 @@ def SYNTACORE_SCR1_MAX : RISCVProcessorModel<"syntacore-scr1-max", def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", NoSchedModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr, FeatureStdExtZicntr, @@ -332,6 +354,7 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu", XiangShanNanHuModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, From 4e2d11f87a595d658f25ec5838b002e4bf5a26fc Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Apr 2024 21:55:50 -0700 Subject: [PATCH 033/300] [RISCV] Remove unnecessary NoHasStdExtZicfiss Predicate from cmop.1 and cmop.5. NFC (#88822) I'm not sure what this was supposed do. Maybe it was for the disassembler, but that should be managed through DecoderNamespace. So let's remove it and simplify the code. --- llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td index dd13a07d606d04..65091aad91003d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td @@ -20,13 +20,7 @@ class CMOPInst imm3, string opcodestr> let Inst{12-11} = 0; } -// CMOP1, CMOP5 is used by Zicfiss. 
-let Predicates = [HasStdExtZcmop, NoHasStdExtZicfiss] in { - def CMOP1 : CMOPInst<0, "cmop.1">, Sched<[]>; - def CMOP5 : CMOPInst<2, "cmop.5">, Sched<[]>; -} - -foreach n = [3, 7, 9, 11, 13, 15] in { +foreach n = [1, 3, 5, 7, 9, 11, 13, 15] in { let Predicates = [HasStdExtZcmop] in def CMOP # n : CMOPInst, Sched<[]>; } From edb0708dc1ceeaeb3356311a4ddf72a0dc9b224f Mon Sep 17 00:00:00 2001 From: Vlad Mishel <43666597+vmishelcs@users.noreply.github.com> Date: Mon, 15 Apr 2024 22:05:29 -0700 Subject: [PATCH 034/300] [InstCombine] Implement `fcmp (fadd x, 0.0), y` => `fcmp x, y` optimization (#88476) This PR addresses issue #88168. It implements an optimization for the case of ``` define i1 @fcmp_fadd_zero_ugt(float %x, float %y) { %add = fadd float %x, 0.000000e+00 %cmp = fcmp ugt float %add, %y ret i1 %cmp } ``` `=>` ``` define i1 @fcmp_fadd_zero_ugt(float %x, float %y) { %cmp = fcmp ugt float %x, %y ret i1 %cmp } ``` and all other types of `fcmp` instructions (`uge`, `ogt`, etc). Proofs: `fadd x, 0.0` https://alive2.llvm.org/ce/z/7FzNnM `fsub x, 0.0` https://alive2.llvm.org/ce/z/puUxLK --- .../InstCombine/InstCombineCompares.cpp | 8 + llvm/test/Transforms/InstCombine/fcmp.ll | 202 ++++++++++++++++++ 2 files changed, 210 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 90550cdbdf8911..ee783eed190a7c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -8097,6 +8097,14 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I); } + // fcmp (fadd X, 0.0), Y --> fcmp X, Y + if (match(Op0, m_FAdd(m_Value(X), m_AnyZeroFP()))) + return new FCmpInst(Pred, X, Op1, "", &I); + + // fcmp X, (fadd Y, 0.0) --> fcmp X, Y + if (match(Op1, m_FAdd(m_Value(Y), m_AnyZeroFP()))) + return new FCmpInst(Pred, Op0, Y, "", &I); + if (match(Op0, m_FPExt(m_Value(X)))) { // fcmp (fpext X), (fpext Y) -> fcmp X, Y if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType()) diff --git a/llvm/test/Transforms/InstCombine/fcmp.ll b/llvm/test/Transforms/InstCombine/fcmp.ll index f2701d16d0f3d1..069512b0f2d8eb 100644 --- a/llvm/test/Transforms/InstCombine/fcmp.ll +++ b/llvm/test/Transforms/InstCombine/fcmp.ll @@ -1284,3 +1284,205 @@ define <1 x i1> @bitcast_1vec_eq0(i32 %x) { %cmp = fcmp oeq <1 x float> %f, zeroinitializer ret <1 x i1> %cmp } + +; Simplify fcmp (x + 0.0), y => fcmp x, y + +define i1 @fcmp_fadd_zero_ugt(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ugt( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_uge(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_uge( +; CHECK-NEXT: [[CMP:%.*]] = fcmp uge float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp uge float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ogt(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ogt( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ogt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_oge(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_oge( +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge float [[ADD:%.*]], [[Y:%.*]] +; 
CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp oge float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ult(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ult( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ult float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ule(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ule( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ule float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ule float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_olt(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_olt( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp olt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ole(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ole( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ole float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ole float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_oeq(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_oeq( +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp oeq float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_one(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_one( +; CHECK-NEXT: [[CMP:%.*]] = fcmp one float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp one float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ueq(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ueq( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ueq float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ueq float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_une(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_une( +; CHECK-NEXT: [[CMP:%.*]] = fcmp une float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp une float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ord(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ord( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ord float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ord float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_uno(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_uno( +; CHECK-NEXT: [[CMP:%.*]] = fcmp uno float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp uno float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_neg_zero(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_neg_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, -0.000000e+00 + %cmp = fcmp ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_switched(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_switched( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %y, 0.000000e+00 + %cmp = fcmp ugt float %x, %add + ret i1 %cmp +} + +define <2 x i1> @fcmp_fadd_zero_vec(<2 x float> %x, <2 x float> %y) { +; 
CHECK-LABEL: @fcmp_fadd_zero_vec( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt <2 x float> [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %add = fadd <2 x float> %x, + %cmp = fcmp ugt <2 x float> %add, %y + ret <2 x i1> %cmp +} + +define i1 @fcmp_fast_fadd_fast_zero(float %x, float %y) { +; CHECK-LABEL: @fcmp_fast_fadd_fast_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd fast float %x, 0.000000e+00 + %cmp = fcmp fast ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fast_fadd_zero(float %x, float %y) { +; CHECK-LABEL: @fcmp_fast_fadd_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp fast ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_fast_zero(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_fast_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd fast float %x, 0.000000e+00 + %cmp = fcmp ugt float %add, %y + ret i1 %cmp +} From 39016e33b0fe78ddb1f11822f71a8a233af4dca9 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Tue, 16 Apr 2024 13:00:06 +0800 Subject: [PATCH 035/300] [C++20] [Modules] Don't import non-inline function bodies even if it is always-inline Recommit https://github.com/llvm/llvm-project/commit/1ecbab56dcbb78268c8d19af34a50591f90b12a0 Close https://github.com/llvm/llvm-project/issues/80949 The new thing in this commit is to allow to import the function body from instantiations if it is marked with always-inline. See the discussion in https://github.com/llvm/llvm-project/issues/86893 for details. --- clang/lib/CodeGen/CodeGenModule.cpp | 17 ++++++++-- .../CodeGenCXX/module-funcs-from-imports.cppm | 31 ++++++++++++++++--- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index e44749672d5827..0c447b20cef40d 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -3952,9 +3952,20 @@ bool CodeGenModule::shouldEmitFunction(GlobalDecl GD) { // behavior may break ABI compatibility of the current unit. if (const Module *M = F->getOwningModule(); M && M->getTopLevelModule()->isNamedModule() && - getContext().getCurrentNamedModule() != M->getTopLevelModule() && - !F->hasAttr()) - return false; + getContext().getCurrentNamedModule() != M->getTopLevelModule()) { + // There are practices to mark template member function as always-inline + // and mark the template as extern explicit instantiation but not give + // the definition for member function. So we have to emit the function + // from explicitly instantiation with always-inline. + // + // See https://github.com/llvm/llvm-project/issues/86893 for details. + // + // TODO: Maybe it is better to give it a warning if we call a non-inline + // function from other module units which is marked as always-inline. 
+ if (!F->isTemplateInstantiation() || !F->hasAttr()) { + return false; + } + } if (F->hasAttr()) return false; diff --git a/clang/test/CodeGenCXX/module-funcs-from-imports.cppm b/clang/test/CodeGenCXX/module-funcs-from-imports.cppm index 33cdf437110a9e..a2a9122fc39130 100644 --- a/clang/test/CodeGenCXX/module-funcs-from-imports.cppm +++ b/clang/test/CodeGenCXX/module-funcs-from-imports.cppm @@ -23,6 +23,21 @@ int func_in_gmf_not_called() { return 44; } +template +class A { +public: + __attribute__((always_inline)) + inline constexpr int getValue() { + return 43; + } + + inline constexpr int getValue2() { + return 43; + } +}; + +extern template class A; + //--- M.cppm module; #include "foo.h" @@ -47,17 +62,21 @@ int always_inline_func() { return 45; } +export using ::A; + //--- Use.cpp import M; int use() { - return exported_func() + always_inline_func(); + A a; + return exported_func() + always_inline_func() + + a.getValue() + a.getValue2(); } -// Checks that none of the function (except the always_inline_func) in the importees -// are generated in the importer's code. // CHECK-O0: define{{.*}}_Z3usev( // CHECK-O0: declare{{.*}}_ZW1M13exported_funcv( -// CHECK-O0: define{{.*}}available_externally{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O0: declare{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O0: define{{.*}}@_ZN1AIcE8getValueEv( +// CHECK-O0: declare{{.*}}@_ZN1AIcE9getValue2Ev( // CHECK-O0-NOT: func_in_gmf // CHECK-O0-NOT: func_in_gmf_not_called // CHECK-O0-NOT: non_exported_func @@ -68,7 +87,9 @@ int use() { // O0 to keep consistent ABI. // CHECK-O1: define{{.*}}_Z3usev( // CHECK-O1: declare{{.*}}_ZW1M13exported_funcv( -// CHECK-O1: define{{.*}}available_externally{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O1: declare{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O1: define{{.*}}@_ZN1AIcE8getValueEv( +// CHECK-O1: declare{{.*}}@_ZN1AIcE9getValue2Ev( // CHECK-O1-NOT: func_in_gmf // CHECK-O1-NOT: func_in_gmf_not_called // CHECK-O1-NOT: non_exported_func From f14146fd46dd8bf7ef1e64d3c13af73dc5668045 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Apr 2024 22:33:14 -0700 Subject: [PATCH 036/300] [RISCV] Correct the spelling of the Zcmop mnemonics. (#88826) The instruction names should be c.mop.1 instead of cmop.1. 
--- llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td | 2 +- llvm/test/MC/RISCV/rv32zcmop-invalid.s | 6 ++-- llvm/test/MC/RISCV/rvzcmop-valid.s | 32 ++++++++++---------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td index 65091aad91003d..32e7f962aa2ab0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td @@ -22,5 +22,5 @@ class CMOPInst imm3, string opcodestr> foreach n = [1, 3, 5, 7, 9, 11, 13, 15] in { let Predicates = [HasStdExtZcmop] in - def CMOP # n : CMOPInst, Sched<[]>; + def C_MOP # n : CMOPInst, Sched<[]>; } diff --git a/llvm/test/MC/RISCV/rv32zcmop-invalid.s b/llvm/test/MC/RISCV/rv32zcmop-invalid.s index 71d72d59b02092..fb6252f7f0760c 100644 --- a/llvm/test/MC/RISCV/rv32zcmop-invalid.s +++ b/llvm/test/MC/RISCV/rv32zcmop-invalid.s @@ -1,7 +1,7 @@ # RUN: not llvm-mc -triple riscv32 -mattr=+zcmop < %s 2>&1 | FileCheck %s -cmop.0 # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic +c.mop.0 # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic -cmop.1 t0 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction +c.mop.1 t0 # CHECK: :[[@LINE]]:9: error: invalid operand for instruction -cmop.1 0x0 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction +c.mop.1 0x0 # CHECK: :[[@LINE]]:9: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rvzcmop-valid.s b/llvm/test/MC/RISCV/rvzcmop-valid.s index c6bb4a15808258..dd5d26ac5dd0cd 100644 --- a/llvm/test/MC/RISCV/rvzcmop-valid.s +++ b/llvm/test/MC/RISCV/rvzcmop-valid.s @@ -9,34 +9,34 @@ # RUN: | llvm-objdump --mattr=+zcmop -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# CHECK-ASM-AND-OBJ: cmop.1 +# CHECK-ASM-AND-OBJ: c.mop.1 # CHECK-ASM: encoding: [0x81,0x60] -cmop.1 +c.mop.1 -# CHECK-ASM-AND-OBJ: cmop.3 +# CHECK-ASM-AND-OBJ: c.mop.3 # CHECK-ASM: encoding: [0x81,0x61] -cmop.3 +c.mop.3 -# CHECK-ASM-AND-OBJ: cmop.5 +# CHECK-ASM-AND-OBJ: c.mop.5 # CHECK-ASM: encoding: [0x81,0x62] -cmop.5 +c.mop.5 -# CHECK-ASM-AND-OBJ: cmop.7 +# CHECK-ASM-AND-OBJ: c.mop.7 # CHECK-ASM: encoding: [0x81,0x63] -cmop.7 +c.mop.7 -# CHECK-ASM-AND-OBJ: cmop.9 +# CHECK-ASM-AND-OBJ: c.mop.9 # CHECK-ASM: encoding: [0x81,0x64] -cmop.9 +c.mop.9 -# CHECK-ASM-AND-OBJ: cmop.11 +# CHECK-ASM-AND-OBJ: c.mop.11 # CHECK-ASM: encoding: [0x81,0x65] -cmop.11 +c.mop.11 -# CHECK-ASM-AND-OBJ: cmop.13 +# CHECK-ASM-AND-OBJ: c.mop.13 # CHECK-ASM: encoding: [0x81,0x66] -cmop.13 +c.mop.13 -# CHECK-ASM-AND-OBJ: cmop.15 +# CHECK-ASM-AND-OBJ: c.mop.15 # CHECK-ASM: encoding: [0x81,0x67] -cmop.15 +c.mop.15 From 17d6bf046cea381413895f91e24d26d65763b59a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Apr 2024 22:39:09 -0700 Subject: [PATCH 037/300] [RISCV] Change how MMO is rebuilt in lowerFixedLengthVectorLoadToRVV/lowerFixedLengthVectorStoreToRVV (#88811) Copy the pointer info, flags, alignment, AAInfo, and ranges, but let getLoad rebuild the MMO using the scalable type used for the the new load/store. This makes sure the LLT minimum size matches the ContainerVT minimum size. This is important since vscale_range may have been used to determine that the fixed vector was the exact size of a scalable vector. 
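The gist of the fix, taken from the load path in the hunk below (the store path is symmetric): rather than cloning the old MMO, whose memory LLT was derived from the fixed-length VT, the MMO's components are handed to getLoad() so it is rebuilt against the scalable ContainerVT:

  MachineMemOperand *MMO = Load->getMemOperand();
  SDValue NewLoad =
      DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(),
                  MMO->getPointerInfo(), MMO->getBaseAlign(), MMO->getFlags(),
                  MMO->getAAInfo(), MMO->getRanges());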
Fixes #88799 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 19 +++++-------------- llvm/test/CodeGen/RISCV/rvv/pr88799.ll | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 14 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/pr88799.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f6ed6420c9e1fa..1d1ea6bae6c105 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -10433,14 +10433,10 @@ RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op, if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() && getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) { MachineMemOperand *MMO = Load->getMemOperand(); - MachineFunction &MF = DAG.getMachineFunction(); - MMO = MF.getMachineMemOperand( - MMO, MMO->getPointerInfo(), - MMO->getMemoryType().isValid() - ? LLT::scalable_vector(1, MMO->getMemoryType().getSizeInBits()) - : MMO->getMemoryType()); SDValue NewLoad = - DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(), MMO); + DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(), + MMO->getPointerInfo(), MMO->getBaseAlign(), MMO->getFlags(), + MMO->getAAInfo(), MMO->getRanges()); SDValue Result = convertFromScalableVector(VT, NewLoad, DAG, Subtarget); return DAG.getMergeValues({Result, NewLoad.getValue(1)}, DL); } @@ -10500,14 +10496,9 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op, if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() && getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) { MachineMemOperand *MMO = Store->getMemOperand(); - MachineFunction &MF = DAG.getMachineFunction(); - MMO = MF.getMachineMemOperand( - MMO, MMO->getPointerInfo(), - MMO->getMemoryType().isValid() - ? 
LLT::scalable_vector(1, MMO->getMemoryType().getSizeInBits()) - : MMO->getMemoryType()); return DAG.getStore(Store->getChain(), DL, NewValue, Store->getBasePtr(), - MMO); + MMO->getPointerInfo(), MMO->getBaseAlign(), + MMO->getFlags(), MMO->getAAInfo()); } SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG, diff --git a/llvm/test/CodeGen/RISCV/rvv/pr88799.ll b/llvm/test/CodeGen/RISCV/rvv/pr88799.ll new file mode 100644 index 00000000000000..7212a789f9e7e0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/pr88799.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=riscv64-unknown-linux-gnu -mattr=+v | FileCheck %s + +define i32 @main() vscale_range(2,2) { +; CHECK-LABEL: main: +; CHECK: # %bb.0: # %vector.body +; CHECK-NEXT: lui a0, 1040368 +; CHECK-NEXT: addiw a0, a0, -144 +; CHECK-NEXT: vl2re16.v v8, (a0) +; CHECK-NEXT: vs2r.v v8, (zero) +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: ret +vector.body: + %0 = load <16 x i16>, ptr getelementptr ([3 x [23 x [23 x i16]]], ptr null, i64 -10593, i64 1, i64 22, i64 0), align 16 + store <16 x i16> %0, ptr null, align 2 + %wide.load = load <vscale x 8 x i16>, ptr getelementptr ([3 x [23 x [23 x i16]]], ptr null, i64 -10593, i64 1, i64 22, i64 0), align 16 + store <vscale x 8 x i16> %wide.load, ptr null, align 2 + ret i32 0 +} From 6da1966bc503e1ce44ef36e7107c9db482fac6ab Mon Sep 17 00:00:00 2001 From: Michal Terepeta Date: Tue, 16 Apr 2024 07:55:45 +0200 Subject: [PATCH 038/300] [RISCV] Add scheduling information for SiFive VCIX (#86093) This adds `RISCVScheduleXSf.td` with `SchedWrite` definitions for all VCIX instructions and uses it in `RISCVSchedSiFive7.td` to set default latencies for these instructions, helping with issue https://github.com/llvm/llvm-project/issues/83391. These default latencies cannot be accurate (each coprocessor will have different latencies), but in our internal testing they are enough to avoid the problematic behavior described in #83391. A follow-up question is how to structure the code so that downstream consumers can more easily use the `SiFive7` scheduling model with accurate VCIX latencies; that can be discussed in a separate issue.
--- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 48 +++++++++++---- llvm/lib/Target/RISCV/RISCVSchedRocket.td | 1 + llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 48 +++++++++++++++ llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td | 1 + llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td | 1 + .../Target/RISCV/RISCVSchedSyntacoreSCR1.td | 1 + .../Target/RISCV/RISCVSchedXiangShanNanHu.td | 1 + llvm/lib/Target/RISCV/RISCVSchedule.td | 1 + llvm/lib/Target/RISCV/RISCVScheduleXSf.td | 59 +++++++++++++++++++ 9 files changed, 149 insertions(+), 12 deletions(-) create mode 100644 llvm/lib/Target/RISCV/RISCVScheduleXSf.td diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 9a6818c99af206..71aa1f19e089a9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -307,10 +307,16 @@ multiclass VPseudoVC_X { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in { - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_X; - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_X; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_X, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_X, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_X; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_X, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } @@ -318,10 +324,16 @@ multiclass VPseudoVC_XV { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in { - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XV; - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XV; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_XV, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_XV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XV; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_XV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } @@ -329,10 +341,16 @@ multiclass VPseudoVC_XVV { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in { - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XVV; - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XVV; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_XVV, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } @@ -340,11 +358,17 @@ multiclass VPseudoVC_XVW { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XVV; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_XVV, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; let Constraints = "@earlyclobber $rd, $rd = $rs3" in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XVV; - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } } diff --git 
a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td index e74c7aab7474da..65494e73758d63 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td @@ -261,4 +261,5 @@ defm : UnsupportedSchedZbkx; defm : UnsupportedSchedZfa; defm : UnsupportedSchedZfh; defm : UnsupportedSchedSFB; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 4dcec96df9d576..a532066b3a1c83 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -962,6 +962,54 @@ let Latency = 3 in def : InstRW<[WriteIALU], (instrs COPY)>; +// VCIX +// +// In principle we don't know the latency of any VCIX instructions. But instead +// of taking the default of 1, which can lead to issues [1], we assume that they +// have a fairly high latency. +// +// [1] https://github.com/llvm/llvm-project/issues/83391 +foreach mx = SchedMxList in { + defvar Cycles = SiFive7GetCyclesDefault.c; + defvar IsWorstCase = SiFive7IsWorstCaseMX.c; + let Latency = !mul(Cycles, 10), + AcquireAtCycles = [0, 1], + ReleaseAtCycles = [1, !add(1, Cycles)] in { + defm "" : LMULWriteResMX<"WriteVC_V_I", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_X", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_IV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_XV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_IVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_IVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_VVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_VVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_XVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_XVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + foreach f = ["FPR16", "FPR32", "FPR64"] in { + defm "" : LMULWriteResMX<"WriteVC_V_" # f # "V", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_" # f # "VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_" # f # "VW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + } + defm "" : LMULWriteResMX<"WriteVC_I", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_X", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_IV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_XV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_IVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_IVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_VVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_VVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_XVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_XVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + foreach f = ["FPR16", "FPR32", "FPR64"] in { + defm "" : LMULWriteResMX<"WriteVC_" # f # "V", [SiFive7VCQ, SiFive7VA], 
mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_" # f # "VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_" # f # "VW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + } + } +} + //===----------------------------------------------------------------------===// // Bypass and advance diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td index 8ec2e4ff885ebb..fccdd7e4f3ec2e 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td @@ -366,4 +366,5 @@ defm : UnsupportedSchedZbkx; defm : UnsupportedSchedSFB; defm : UnsupportedSchedZfa; defm : UnsupportedSchedV; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td index 0ced2efa3f7abf..6e4fb19361f553 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td @@ -1040,4 +1040,5 @@ defm : UnsupportedSchedZbkb; defm : UnsupportedSchedZbkx; defm : UnsupportedSchedSFB; defm : UnsupportedSchedZfa; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td index 9625d17e0b2600..0885e325f24e68 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td @@ -212,4 +212,5 @@ defm : UnsupportedSchedZbkb; defm : UnsupportedSchedZbkx; defm : UnsupportedSchedZfa; defm : UnsupportedSchedZfh; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td index 4fc7b0335af538..e0f1fab1d6b409 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td +++ b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td @@ -311,4 +311,5 @@ defm : UnsupportedSchedZfa; defm : UnsupportedSchedZfh; defm : UnsupportedSchedSFB; defm : UnsupportedSchedZabha; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td index 1d19624342d2bb..0086557a41fe7c 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedule.td +++ b/llvm/lib/Target/RISCV/RISCVSchedule.td @@ -296,3 +296,4 @@ def : ReadAdvance; // Include the scheduler resources for other instruction extensions. include "RISCVScheduleZb.td" include "RISCVScheduleV.td" +include "RISCVScheduleXSf.td" diff --git a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td new file mode 100644 index 00000000000000..58d508460f0190 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td @@ -0,0 +1,59 @@ +//===-- RISCVScheduleXSf.td - Scheduling Definitions XSf ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the scheduling information for SiFive extensions. 
+// +//===----------------------------------------------------------------------===// + +multiclass LMULSchedWritesVCIX{ +defm "" : LMULSchedWrites<"WriteVC_" # id>; +defm "" : LMULSchedWrites<"WriteVC_V_" # id>; +} + +defm "" : LMULSchedWritesVCIX<"I">; +defm "" : LMULSchedWritesVCIX<"X">; +defm "" : LMULSchedWritesVCIX<"IV">; +defm "" : LMULSchedWritesVCIX<"VV">; +defm "" : LMULSchedWritesVCIX<"XV">; +defm "" : LMULSchedWritesVCIX<"IVV">; +defm "" : LMULSchedWritesVCIX<"IVW">; +defm "" : LMULSchedWritesVCIX<"VVV">; +defm "" : LMULSchedWritesVCIX<"VVW">; +defm "" : LMULSchedWritesVCIX<"XVV">; +defm "" : LMULSchedWritesVCIX<"XVW">; +foreach f = ["FPR16", "FPR32", "FPR64"] in { + defm "" : LMULSchedWritesVCIX; + defm "" : LMULSchedWritesVCIX; + defm "" : LMULSchedWritesVCIX; +} + +multiclass LMULWriteResVCIX resources>{ +defm : LMULWriteRes<"WriteVC_" # id, resources>; +defm : LMULWriteRes<"WriteVC_V_" # id, resources>; +} + +multiclass UnsupportedSchedXsfvcp { +let Unsupported = true in { +defm : LMULWriteResVCIX<"I", []>; +defm : LMULWriteResVCIX<"X", []>; +defm : LMULWriteResVCIX<"IV", []>; +defm : LMULWriteResVCIX<"VV", []>; +defm : LMULWriteResVCIX<"XV", []>; +defm : LMULWriteResVCIX<"IVV", []>; +defm : LMULWriteResVCIX<"IVW", []>; +defm : LMULWriteResVCIX<"VVV", []>; +defm : LMULWriteResVCIX<"VVW", []>; +defm : LMULWriteResVCIX<"XVV", []>; +defm : LMULWriteResVCIX<"XVW", []>; +foreach f = ["FPR16", "FPR32", "FPR64"] in { + defm : LMULWriteResVCIX; + defm : LMULWriteResVCIX; + defm : LMULWriteResVCIX; +} +} +} From e2c91091e537a54f1469610ab407e7c9561e7ffc Mon Sep 17 00:00:00 2001 From: darkbuck Date: Tue, 16 Apr 2024 02:36:12 -0400 Subject: [PATCH 039/300] [GlobalISel] Handle more commutable instructions in `commute_constant_to_rhs` Reviewers: rupprecht, aartbik, cyndyishida, Pierre-vh, aemerson, ftynse, hanhanW, banach-space, jayfoad, nicolasvasilache, daniel-grumberg, arsenm, PeimingLiu, JDevlieghere, matthias-springer Reviewed By: arsenm, Pierre-vh Pull Request: https://github.com/llvm/llvm-project/pull/87424 --- .../include/llvm/Target/GlobalISel/Combine.td | 10 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 38 +- .../combine-commute-fp-const-lhs.mir | 126 +++++ .../combine-commute-int-const-lhs.mir | 456 ++++++++++++++++++ .../combine-const-fold-barrier-rhs.mir | 160 ++++++ ...relegalizer-combiner-select-to-fminmax.mir | 16 +- .../AArch64/GlobalISel/select-to-fmin-fmax.ll | 16 +- .../GlobalISel/combine-fcanonicalize.mir | 6 +- .../GlobalISel/postlegalizercombiner-and.mir | 2 +- llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll | 4 +- llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll | 4 +- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 18 +- 12 files changed, 815 insertions(+), 41 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 778ff7e437eb50..8568a7ae90e56c 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -443,14 +443,20 @@ def select_constant_cmp: GICombineRule< // TODO: handle compares (currently not marked as isCommutable) def commute_int_constant_to_rhs : GICombineRule< (defs root:$root), - (match (wip_match_opcode G_ADD, G_MUL, G_AND, G_OR, G_XOR):$root, + (match (wip_match_opcode G_ADD, G_MUL, G_AND, G_OR, G_XOR, + G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_UADDO, G_SADDO, + G_UMULO, G_SMULO, G_UMULH, G_SMULH, + G_UADDSAT, G_SADDSAT, G_SMULFIX, G_UMULFIX, + 
G_SMULFIXSAT, G_UMULFIXSAT):$root, [{ return Helper.matchCommuteConstantToRHS(*${root}); }]), (apply [{ Helper.applyCommuteBinOpOperands(*${root}); }]) >; def commute_fp_constant_to_rhs : GICombineRule< (defs root:$root), - (match (wip_match_opcode G_FADD, G_FMUL):$root, + (match (wip_match_opcode G_FADD, G_FMUL, G_FMINNUM, G_FMAXNUM, + G_FMINNUM_IEEE, G_FMAXNUM_IEEE, + G_FMINIMUM, G_FMAXIMUM):$root, [{ return Helper.matchCommuteFPConstantToRHS(*${root}); }]), (apply [{ Helper.applyCommuteBinOpOperands(*${root}); }]) >; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 40c5119ee7fb3b..3829c33369b275 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6273,8 +6273,21 @@ bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) { } bool CombinerHelper::matchCommuteConstantToRHS(MachineInstr &MI) { - Register LHS = MI.getOperand(1).getReg(); - Register RHS = MI.getOperand(2).getReg(); + unsigned LHSOpndIdx = 1; + unsigned RHSOpndIdx = 2; + switch (MI.getOpcode()) { + case TargetOpcode::G_UADDO: + case TargetOpcode::G_SADDO: + case TargetOpcode::G_UMULO: + case TargetOpcode::G_SMULO: + LHSOpndIdx = 2; + RHSOpndIdx = 3; + break; + default: + break; + } + Register LHS = MI.getOperand(LHSOpndIdx).getReg(); + Register RHS = MI.getOperand(RHSOpndIdx).getReg(); if (!getIConstantVRegVal(LHS, MRI)) { // Skip commuting if LHS is not a constant. But, LHS may be a // G_CONSTANT_FOLD_BARRIER. If so we commute as long as we don't already @@ -6300,10 +6313,23 @@ bool CombinerHelper::matchCommuteFPConstantToRHS(MachineInstr &MI) { void CombinerHelper::applyCommuteBinOpOperands(MachineInstr &MI) { Observer.changingInstr(MI); - Register LHSReg = MI.getOperand(1).getReg(); - Register RHSReg = MI.getOperand(2).getReg(); - MI.getOperand(1).setReg(RHSReg); - MI.getOperand(2).setReg(LHSReg); + unsigned LHSOpndIdx = 1; + unsigned RHSOpndIdx = 2; + switch (MI.getOpcode()) { + case TargetOpcode::G_UADDO: + case TargetOpcode::G_SADDO: + case TargetOpcode::G_UMULO: + case TargetOpcode::G_SMULO: + LHSOpndIdx = 2; + RHSOpndIdx = 3; + break; + default: + break; + } + Register LHSReg = MI.getOperand(LHSOpndIdx).getReg(); + Register RHSReg = MI.getOperand(RHSOpndIdx).getReg(); + MI.getOperand(LHSOpndIdx).setReg(RHSReg); + MI.getOperand(RHSOpndIdx).setReg(LHSReg); Observer.changedInstr(MI); } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir index 76d82884a7b1f1..d791660b7a5eb2 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir @@ -116,3 +116,129 @@ body: | $q0 = COPY %mul RET_ReallyLR ... +--- +name: fminnum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fminnum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %min:_(s32) = G_FMINNUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %min:_(s32) = G_FMINNUM %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... 
+--- +name: fmaxnum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fmaxnum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %max:_(s32) = G_FMAXNUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %max:_(s32) = G_FMAXNUM %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... +--- +name: fminnum_ieee +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fminnum_ieee + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %min:_(s32) = G_FMINNUM_IEEE [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %min:_(s32) = G_FMINNUM_IEEE %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... +--- +name: fmaxnum_ieee +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fmaxnum_ieee + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %max:_(s32) = G_FMAXNUM_IEEE [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %max:_(s32) = G_FMAXNUM_IEEE %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... +--- +name: fminimum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fminimum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %min:_(s32) = G_FMINIMUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %min:_(s32) = G_FMINIMUM %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... +--- +name: fmaximum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fmaximum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %max:_(s32) = G_FMAXIMUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %max:_(s32) = G_FMAXIMUM %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir new file mode 100644 index 00000000000000..16365494f5f4ec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir @@ -0,0 +1,456 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- +name: add +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: add + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32) = G_ADD [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32) = G_ADD %cst, %0 + $s0 = COPY %add + RET_ReallyLR + +... +--- +name: mul +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: mul + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_MUL [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_MUL %cst, %0 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: and +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: and + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %and:_(s32) = G_AND [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %and(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 5 + %and:_(s32) = G_AND %cst, %0 + $s0 = COPY %and + RET_ReallyLR +... +--- +name: or +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: or + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %or:_(s32) = G_OR [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %or(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 5 + %or:_(s32) = G_OR %cst, %0 + $s0 = COPY %or + RET_ReallyLR +... +--- +name: xor +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: xor + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %xor:_(s32) = G_XOR [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %xor(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 5 + %xor:_(s32) = G_XOR %cst, %0 + $s0 = COPY %xor + RET_ReallyLR +... +--- +name: smin +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smin + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %min:_(s32) = G_SMIN [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %min:_(s32) = G_SMIN %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... 
+--- +name: smax +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smax + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %max:_(s32) = G_SMAX [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %max:_(s32) = G_SMAX %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... +--- +name: umin +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umin + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %min:_(s32) = G_UMIN [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %min:_(s32) = G_UMIN %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... +--- +name: umax +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umax + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %max:_(s32) = G_UMAX [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %max:_(s32) = G_UMAX %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... +--- +name: uaddo +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: uaddo + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32), %overflow:_(s1) = G_UADDO [[COPY]], %cst + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $s0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32), %overflow:_(s1) = G_UADDO %cst, %0 + %ret:_(s32) = G_ANYEXT %overflow + $s0 = COPY %ret + RET_ReallyLR + +... +--- +name: saddo +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: saddo + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32), %overflow:_(s1) = G_SADDO [[COPY]], %cst + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $s0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32), %overflow:_(s1) = G_SADDO %cst, %0 + %ret:_(s32) = G_ANYEXT %overflow + $s0 = COPY %ret + RET_ReallyLR + +... +--- +name: umulo +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umulo + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_UMULO [[COPY]], %cst + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $s0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32), %overflow:_(s1) = G_UMULO %cst, %0 + %ret:_(s32) = G_ANYEXT %overflow + $s0 = COPY %ret + RET_ReallyLR +... 
+--- +name: smulo +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smulo + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO [[COPY]], %cst + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $s0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32), %overflow:_(s1) = G_SMULO %cst, %0 + %ret:_(s32) = G_ANYEXT %overflow + $s0 = COPY %ret + RET_ReallyLR +... +--- +name: umulh +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umulh + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_UMULH [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_UMULH %cst, %0 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: smulh +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smulh + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_UMULH [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_UMULH %cst, %0 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: uaddsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: uaddsat + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32) = G_UADDSAT [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32) = G_UADDSAT %cst, %0 + $s0 = COPY %add + RET_ReallyLR + +... +--- +name: saddsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: saddsat + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32) = G_SADDSAT [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32) = G_SADDSAT %cst, %0 + $s0 = COPY %add + RET_ReallyLR + +... +--- +name: smulfix +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smulfix + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_SMULFIX [[COPY]], %cst, 7 + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_SMULFIX %cst, %0, 7 + $s0 = COPY %mul + RET_ReallyLR +... 
+--- +name: umulfix +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umulfix + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_UMULFIX [[COPY]], %cst, 7 + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_UMULFIX %cst, %0, 7 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: smulfixsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smulfixsat + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_SMULFIXSAT [[COPY]], %cst, 7 + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_SMULFIXSAT %cst, %0, 7 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: umulfixsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umulfixsat + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_UMULFIXSAT [[COPY]], %cst, 7 + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_UMULFIXSAT %cst, %0, 7 + $s0 = COPY %mul + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir index 01e0dce5a661cb..c967e4f2ea5e8c 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir @@ -78,3 +78,163 @@ body: | RET_ReallyLR ... +--- +name: cfb_lhs_smulo +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_smulo + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO [[COPY]], %cfb + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $w0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %0 + %ret:_(s32) = G_ANYEXT %overflow + $w0 = COPY %ret + RET_ReallyLR + +... 
+--- +name: cfb_lhs_cfb_already_rhs_smulo +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_cfb_already_rhs_smulo + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2 + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cfb2 + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $w0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %cst2:_(s32) = G_CONSTANT i32 6 + %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2 + %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cfb2 + %ret:_(s32) = G_ANYEXT %overflow + $w0 = COPY %ret + RET_ReallyLR + +... +--- +name: cfb_lhs_cst_on_rhs_smulo +alignment: 4 +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_cst_on_rhs_smulo + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cst2 + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $w0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %cst2:_(s32) = G_CONSTANT i32 6 + %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cst2 + %ret:_(s32) = G_ANYEXT %overflow + $w0 = COPY %ret + RET_ReallyLR + +... +--- +name: cfb_lhs_umulfixsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_umulfixsat + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %mul:_(s32) = G_UMULFIXSAT [[COPY]], %cfb, 7 + ; CHECK-NEXT: $w0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %mul:_(s32) = G_UMULFIXSAT %cfb, %0, 7 + $w0 = COPY %mul + RET_ReallyLR + +... +--- +name: cfb_lhs_cfb_already_rhs_umulfixsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_cfb_already_rhs_umulfixsat + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2 + ; CHECK-NEXT: %add:_(s32) = G_UMULFIXSAT %cfb, %cfb2, 7 + ; CHECK-NEXT: $w0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %cst2:_(s32) = G_CONSTANT i32 2 + %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2 + %add:_(s32) = G_UMULFIXSAT %cfb, %cfb2, 7 + $w0 = COPY %add + RET_ReallyLR + +... 
+--- +name: cfb_lhs_cst_on_rhs_umulfixsat +alignment: 4 +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_cst_on_rhs_umulfixsat + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: %add:_(s32) = G_UMULFIXSAT %cfb, %cst2, 7 + ; CHECK-NEXT: $w0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %cst2:_(s32) = G_CONSTANT i32 2 + %add:_(s32) = G_UMULFIXSAT %cfb, %cst2, 7 + $w0 = COPY %add + RET_ReallyLR + +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir index 8c4300d9e7329f..03e507f5eaa7fb 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir @@ -11,7 +11,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[COPY]], [[C]] ; CHECK-NEXT: $h0 = COPY [[FMAXIMUM]](s16) ; CHECK-NEXT: RET_ReallyLR implicit $h0 %0:_(s16) = COPY $h0 @@ -33,7 +33,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[C]] ; CHECK-NEXT: $s0 = COPY [[FMAXIMUM]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $s0 %0:_(s32) = COPY $s0 @@ -55,7 +55,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[COPY]], [[C]] ; CHECK-NEXT: $d0 = COPY [[FMAXIMUM]](s64) ; CHECK-NEXT: RET_ReallyLR implicit $d0 %0:_(s64) = COPY $d0 @@ -77,7 +77,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(s64) = G_FMINIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(s64) = G_FMINIMUM [[COPY]], [[C]] ; CHECK-NEXT: $d0 = COPY [[FMINIMUM]](s64) ; CHECK-NEXT: RET_ReallyLR implicit $d0 %0:_(s64) = COPY $d0 @@ -100,7 +100,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<8 x s16>) = G_FMAXIMUM [[BUILD_VECTOR]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<8 x s16>) = G_FMAXIMUM [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMAXIMUM]](<8 x s16>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<8 x s16>) = COPY $q0 @@ -125,7 +125,7 @@ body: | ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; CHECK-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<4 x s32>) = G_FMAXIMUM [[BUILD_VECTOR]], [[BITCAST]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<4 x s32>) = G_FMAXIMUM [[BITCAST]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMAXIMUM]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %1:_(<2 x s64>) = COPY $q0 @@ -150,7 +150,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMAXIMUM [[BUILD_VECTOR]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMAXIMUM [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMAXIMUM]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<2 x s64>) = COPY $q0 @@ -174,7 +174,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) - ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMINIMUM [[BUILD_VECTOR]], [[COPY]] + ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMINIMUM [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMINIMUM]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<2 x s64>) = COPY $q0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll b/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll index 7badf4732fd0d4..ae0a9b1c7c4f1f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll @@ -4,7 +4,7 @@ define half @test_s16(half %a) #0 { ; CHECK-LABEL: test_s16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: fmax h0, h1, h0 +; CHECK-NEXT: fmax h0, h0, h1 ; CHECK-NEXT: ret entry: %fcmp = fcmp olt half %a, 0.0 @@ -16,7 +16,7 @@ define float @test_s32(float %a) #0 { ; CHECK-LABEL: test_s32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: fmax s0, s1, s0 +; CHECK-NEXT: fmax s0, s0, s1 ; CHECK-NEXT: ret entry: %fcmp = fcmp olt float %a, 0.0 @@ -28,7 +28,7 @@ define double @test_s64(double %a) #0 { ; CHECK-LABEL: test_s64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: fmax d0, d1, d0 +; CHECK-NEXT: fmax d0, d0, d1 ; CHECK-NEXT: ret entry: %fcmp = fcmp olt double %a, 0.0 @@ -40,7 +40,7 @@ define <4 x half> @test_v4s16(<4 x half> %a) #0 { ; CHECK-LABEL: test_v4s16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.4h, v1.4h, v0.4h +; CHECK-NEXT: fmax v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <4 x half> %a, zeroinitializer @@ -52,7 +52,7 @@ define <8 x half> @test_v8s16(<8 x half> %a) #0 { ; CHECK-LABEL: test_v8s16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.8h, v1.8h, v0.8h +; CHECK-NEXT: fmax v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <8 x half> %a, zeroinitializer @@ -64,7 +64,7 @@ define <2 x float> @test_v2s32(<2 x float> %a) #0 { ; CHECK-LABEL: test_v2s32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.2s, v1.2s, v0.2s +; CHECK-NEXT: fmax v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <2 x float> %a, 
zeroinitializer @@ -76,7 +76,7 @@ define <4 x float> @test_v4s32(<4 x float> %a) #0 { ; CHECK-LABEL: test_v4s32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.4s, v1.4s, v0.4s +; CHECK-NEXT: fmax v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <4 x float> %a, zeroinitializer @@ -88,7 +88,7 @@ define <2 x double> @test_v2s64(<2 x double> %a) #0 { ; CHECK-LABEL: test_v2s64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmax v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <2 x double> %a, zeroinitializer diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir index ee0e83c5e07632..020761352148f2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir @@ -254,8 +254,8 @@ body: | ; CHECK-NEXT: %one_s32:_(s32) = G_ANYEXT %one(s16) ; CHECK-NEXT: %one_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %one_s32(s32), %undef(s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat - ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %zero_undef, [[FMUL]] - ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %one_undef, [[FMAXNUM_IEEE]] + ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FMUL]], %zero_undef + ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FMAXNUM_IEEE]], %one_undef ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %two:_(s16) = G_FCONSTANT half 0xH4000 @@ -306,7 +306,7 @@ body: | ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat ; CHECK-NEXT: %snan_undef_fcan:_(<2 x s16>) = G_FCANONICALIZE %snan_undef ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %snan_undef_fcan, [[FMUL]] - ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %qnan_undef, [[FMAXNUM_IEEE]] + ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FMAXNUM_IEEE]], %qnan_undef ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %two:_(s16) = G_FCONSTANT half 0xH4000 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir index d6321dae3aa7e5..67e6de1ce76449 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir @@ -318,7 +318,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %val:_(s32) = COPY $vgpr4 ; CHECK-NEXT: %k255:_(s32) = G_CONSTANT i32 255 - ; CHECK-NEXT: %umin0:_(s32) = G_UMIN %k255, %val + ; CHECK-NEXT: %umin0:_(s32) = G_UMIN %val, %k255 ; CHECK-NEXT: $vgpr0 = COPY %umin0(s32) %ptr0:_(p1) = COPY $vgpr0_vgpr1 %ptr1:_(p1) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll index dc13dee4f148ac..1d94d76da148f7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll @@ -145,10 +145,10 @@ define <2 x i16> @test_max_K0min_K1Val__v2i16(<2 x i16> %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 17 ; GFX8-NEXT: v_min_i16_e32 v1, 17, v0 -; GFX8-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, -12 ; GFX8-NEXT: v_max_i16_e32 v1, -12, v1 -; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll index 7e38762e7b559c..a8233054db9bc6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll @@ -145,10 +145,10 @@ define <2 x i16> @test_max_K0min_K1Val__v2u16(<2 x i16> %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 17 ; GFX8-NEXT: v_min_u16_e32 v1, 17, v0 -; GFX8-NEXT: v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, 12 ; GFX8-NEXT: v_max_u16_e32 v1, 12, v1 -; GFX8-NEXT: v_max_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 07480a0ce0c2e7..cc0f7e2ca5a54c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -983,7 +983,7 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v7, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_mul_lo_u32 v8, v4, v7 @@ -1010,7 +1010,7 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v5 ; CHECK-NEXT: v_mul_lo_u32 v5, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v8, v4, v6 ; CHECK-NEXT: v_mul_hi_u32 v9, v3, v6 @@ -1058,7 +1058,7 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v4, v4, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 @@ -1265,10 +1265,10 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_mul_lo_u32 v12, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_mul_lo_u32 v7, v7, v4 
@@ -1339,7 +1339,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v9, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v5, v7 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v5 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_mul_lo_u32 v10, v6, v9 @@ -1366,7 +1366,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v5, v7 ; CGP-NEXT: v_mul_lo_u32 v7, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v10, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 @@ -1433,10 +1433,10 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_mul_lo_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v4, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v7, v4 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_lo_u32 v11, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v5, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; CGP-NEXT: v_mul_lo_u32 v8, v8, v4 From 3c6f91e5b671321c95259dabecdbdfe4a6d69ce1 Mon Sep 17 00:00:00 2001 From: martinboehme Date: Tue, 16 Apr 2024 08:49:45 +0200 Subject: [PATCH 040/300] [clang][dataflow] Fix result object location for builtin `<=>`. (#88726) The newly added test causes an assertion failure in `PropagateResultObject()` without the fix added here. --- .../FlowSensitive/DataflowEnvironment.cpp | 5 ++ .../Analysis/FlowSensitive/TransferTest.cpp | 52 +++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index bea15ce9bd24d1..ee2581143e1141 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -508,6 +508,11 @@ class ResultObjectVisitor : public RecursiveASTVisitor { isa(E)) { return; } + if (auto *Op = dyn_cast(E); + Op && Op->getOpcode() == BO_Cmp) { + // Builtin `<=>` returns a `std::strong_ordering` object. + return; + } if (auto *InitList = dyn_cast(E)) { if (!InitList->isSemanticForm()) diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index 00dafb2988c690..d8bcc3da4b8b1c 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -3098,6 +3098,58 @@ TEST(TransferTest, ResultObjectLocationForCXXOperatorCallExpr) { }); } +// Check that the `std::strong_ordering` object returned by builtin `<=>` has a +// correctly modeled result object location. +TEST(TransferTest, ResultObjectLocationForBuiltinSpaceshipOperator) { + std::string Code = R"( + namespace std { + // This is the minimal definition required to get + // `Sema::CheckComparisonCategoryType()` to accept this fake. 
+ struct strong_ordering { + enum class ordering { less, equal, greater }; + ordering o; + static const strong_ordering less; + static const strong_ordering equivalent; + static const strong_ordering equal; + static const strong_ordering greater; + }; + + inline constexpr strong_ordering strong_ordering::less = + { strong_ordering::ordering::less }; + inline constexpr strong_ordering strong_ordering::equal = + { strong_ordering::ordering::equal }; + inline constexpr strong_ordering strong_ordering::equivalent = + { strong_ordering::ordering::equal }; + inline constexpr strong_ordering strong_ordering::greater = + { strong_ordering::ordering::greater }; + } + void target(int i, int j) { + auto ordering = i <=> j; + // [[p]] + } + )"; + using ast_matchers::binaryOperator; + using ast_matchers::hasOperatorName; + using ast_matchers::match; + using ast_matchers::selectFirst; + using ast_matchers::traverse; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + + auto *Spaceship = selectFirst( + "op", + match(binaryOperator(hasOperatorName("<=>")).bind("op"), ASTCtx)); + + EXPECT_EQ( + &Env.getResultObjectLocation(*Spaceship), + &getLocForDecl(ASTCtx, Env, "ordering")); + }, + LangStandard::lang_cxx20); +} + TEST(TransferTest, ResultObjectLocationForStdInitializerListExpr) { std::string Code = R"( namespace std { From 7e49b0d5a67f212e84f8ec0ec2e39a6a8673bfaf Mon Sep 17 00:00:00 2001 From: Daniil Kovalev Date: Tue, 16 Apr 2024 09:53:33 +0300 Subject: [PATCH 041/300] [lldb] Fix nullptr dereference on running x86 binary with x86-disabled llvm (#82603) If `LLVM_TARGETS_TO_BUILD` does not contain `X86` and we try to run an x86 binary in lldb, we get a `nullptr` dereference in `LLVMDisasmInstruction(...)`. We try to call `getDisAsm()` method on a `LLVMDisasmContext *DC` which is null. The pointer is passed from `x86AssemblyInspectionEngine::instruction_length(...)` and is originally `m_disasm_context` member of `x86AssemblyInspectionEngine`. This should be filled by `LLVMCreateDisasm(...)` in the class constructor, but not having X86 target enabled in llvm makes `TargetRegistry::lookupTarget(...)` call return `nullptr`, which results in `m_disasm_context` initialized with `nullptr` as well. This patch adds if statements against `m_disasm_context` in `x86AssemblyInspectionEngine::GetNonCallSiteUnwindPlanFromAssembly(...)` and `x86AssemblyInspectionEngine::FindFirstNonPrologueInstruction(...)` so subsequent calls to `x86AssemblyInspectionEngine::instruction_length(...)` do not cause a null pointer dereference. 
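The guard itself is a two-line early return in each affected entry point. A self-contained analogue of the defensive pattern (illustrative only; apart from m_disasm_context, the names here are invented for the example):

    #include <cstdio>

    // An engine whose disassembler context may legitimately be null (the X86
    // target was never linked in) must check before handing it to callees.
    struct Engine {
      void *m_disasm_context = nullptr; // stands in for a failed LLVMCreateDisasm()
      bool Analyze() {
        if (m_disasm_context == nullptr)
          return false; // fail gracefully instead of dereferencing null later
        return true;
      }
    };

    int main() {
      Engine E;
      std::printf("%d\n", E.Analyze()); // prints 0: analysis refused, no crash
    }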
--- .../x86/x86AssemblyInspectionEngine.cpp | 6 + lldb/unittests/UnwindAssembly/CMakeLists.txt | 4 + .../x86-but-no-x86-target/CMakeLists.txt | 10 ++ .../Testx86AssemblyInspectionEngine.cpp | 103 ++++++++++++++++++ 4 files changed, 123 insertions(+) create mode 100644 lldb/unittests/UnwindAssembly/x86-but-no-x86-target/CMakeLists.txt create mode 100644 lldb/unittests/UnwindAssembly/x86-but-no-x86-target/Testx86AssemblyInspectionEngine.cpp diff --git a/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp b/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp index 2032c5a68d054c..6bfaa54135a959 100644 --- a/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp +++ b/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp @@ -909,6 +909,9 @@ bool x86AssemblyInspectionEngine::GetNonCallSiteUnwindPlanFromAssembly( if (!m_register_map_initialized) return false; + if (m_disasm_context == nullptr) + return false; + addr_t current_func_text_offset = 0; int current_sp_bytes_offset_from_fa = 0; bool is_aligned = false; @@ -1570,6 +1573,9 @@ bool x86AssemblyInspectionEngine::FindFirstNonPrologueInstruction( if (!m_register_map_initialized) return false; + if (m_disasm_context == nullptr) + return false; + while (offset < size) { int regno; int insn_len; diff --git a/lldb/unittests/UnwindAssembly/CMakeLists.txt b/lldb/unittests/UnwindAssembly/CMakeLists.txt index 136fcd9ae97981..d6e4471af4ecb3 100644 --- a/lldb/unittests/UnwindAssembly/CMakeLists.txt +++ b/lldb/unittests/UnwindAssembly/CMakeLists.txt @@ -9,3 +9,7 @@ endif() if ("X86" IN_LIST LLVM_TARGETS_TO_BUILD) add_subdirectory(x86) endif() + +if (NOT "X86" IN_LIST LLVM_TARGETS_TO_BUILD) + add_subdirectory(x86-but-no-x86-target) +endif() diff --git a/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/CMakeLists.txt b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/CMakeLists.txt new file mode 100644 index 00000000000000..d28e9629a64cfc --- /dev/null +++ b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/CMakeLists.txt @@ -0,0 +1,10 @@ +add_lldb_unittest(UnwindAssemblyX86ButNoX86TargetTests + Testx86AssemblyInspectionEngine.cpp + LINK_LIBS + lldbCore + lldbSymbol + lldbPluginUnwindAssemblyX86 + LINK_COMPONENTS + Support + ${LLVM_TARGETS_TO_BUILD} + ) diff --git a/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/Testx86AssemblyInspectionEngine.cpp b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/Testx86AssemblyInspectionEngine.cpp new file mode 100644 index 00000000000000..ed093d146440e3 --- /dev/null +++ b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/Testx86AssemblyInspectionEngine.cpp @@ -0,0 +1,103 @@ +//===-- Testx86AssemblyInspectionEngine.cpp -------------------------------===// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "gtest/gtest.h" + +#include "Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.h" +#include "lldb/Core/AddressRange.h" +#include "lldb/Symbol/UnwindPlan.h" +#include "lldb/Utility/ArchSpec.h" + +#include "llvm/Support/TargetSelect.h" + +#include +#include + +using namespace lldb; +using namespace lldb_private; + +class Testx86AssemblyInspectionEngine : public testing::Test { +public: + static void SetUpTestCase(); +}; + +void Testx86AssemblyInspectionEngine::SetUpTestCase() { + llvm::InitializeAllTargets(); + llvm::InitializeAllAsmPrinters(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllDisassemblers(); +} + +// only defining the register names / numbers that the unwinder is actually +// using today + +// names should match the constants below. These will be the eRegisterKindLLDB +// register numbers. + +const char *x86_64_reg_names[] = {"rax", "rbx", "rcx", "rdx", "rsp", "rbp", + "rsi", "rdi", "r8", "r9", "r10", "r11", + "r12", "r13", "r14", "r15", "rip"}; + +enum x86_64_regs { + k_rax = 0, + k_rbx = 1, + k_rcx = 2, + k_rdx = 3, + k_rsp = 4, + k_rbp = 5, + k_rsi = 6, + k_rdi = 7, + k_r8 = 8, + k_r9 = 9, + k_r10 = 10, + k_r11 = 11, + k_r12 = 12, + k_r13 = 13, + k_r14 = 14, + k_r15 = 15, + k_rip = 16 +}; + +std::unique_ptr Getx86_64Inspector() { + + ArchSpec arch("x86_64-apple-macosx"); + std::unique_ptr engine( + new x86AssemblyInspectionEngine(arch)); + + std::vector lldb_regnums; + int i = 0; + for (const auto &name : x86_64_reg_names) { + x86AssemblyInspectionEngine::lldb_reg_info ri; + ri.name = name; + ri.lldb_regnum = i++; + lldb_regnums.push_back(ri); + } + + engine->Initialize(lldb_regnums); + return engine; +} + +TEST_F(Testx86AssemblyInspectionEngine, TestSimple64bitFrameFunction) { + std::unique_ptr engine = Getx86_64Inspector(); + + // 'int main() { }' compiled for x86_64-apple-macosx with clang + uint8_t data[] = { + 0x55, // offset 0 -- pushq %rbp + 0x48, 0x89, 0xe5, // offset 1 -- movq %rsp, %rbp + 0x31, 0xc0, // offset 4 -- xorl %eax, %eax + 0x5d, // offset 6 -- popq %rbp + 0xc3 // offset 7 -- retq + }; + + AddressRange sample_range(0x1000, sizeof(data)); + + UnwindPlan unwind_plan(eRegisterKindLLDB); + EXPECT_FALSE(engine->GetNonCallSiteUnwindPlanFromAssembly( + data, sizeof(data), sample_range, unwind_plan)); +} From 89071f35599ac58cde99923a376d6b75c0d49e4b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 16 Apr 2024 00:09:41 -0700 Subject: [PATCH 042/300] [clang] Drop unaligned from calls to readNext (NFC) (#88842) Now readNext defaults to unaligned accesses. This patch drops unaligned to improve readability. 
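As a hedged illustration of the call-site simplification (a sketch of ours,
assuming the llvm/Support/Endian.h definition of readNext whose alignment
parameter now defaults to unaligned):

    #include "llvm/Support/Endian.h"
    #include <cstdint>
    #include <cstdio>

    int main() {
      using namespace llvm::support;
      const uint8_t Buf[4] = {0x01, 0x00, 0x00, 0x00};
      const uint8_t *P = Buf;
      // Before this patch, call sites spelled out the alignment:
      //   endian::readNext<uint32_t, llvm::endianness::little, unaligned>(P);
      // After it, the trailing unaligned argument is the default and is
      // simply dropped:
      uint32_t V = endian::readNext<uint32_t, llvm::endianness::little>(P);
      std::printf("%u\n", V); // prints 1 regardless of host endianness
      return 0;
    }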
--- clang/lib/APINotes/APINotesReader.cpp | 137 +++++++----------- clang/lib/Serialization/ASTReader.cpp | 87 +++++------ clang/lib/Serialization/GlobalModuleIndex.cpp | 12 +- .../lib/Serialization/MultiOnDiskHashTable.h | 4 +- 4 files changed, 98 insertions(+), 142 deletions(-) diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp index fbbe9c32ce1258..dfc3beb6fa13ee 100644 --- a/clang/lib/APINotes/APINotesReader.cpp +++ b/clang/lib/APINotes/APINotesReader.cpp @@ -30,23 +30,20 @@ namespace { llvm::VersionTuple ReadVersionTuple(const uint8_t *&Data) { uint8_t NumVersions = (*Data++) & 0x03; - unsigned Major = - endian::readNext(Data); + unsigned Major = endian::readNext(Data); if (NumVersions == 0) return llvm::VersionTuple(Major); - unsigned Minor = - endian::readNext(Data); + unsigned Minor = endian::readNext(Data); if (NumVersions == 1) return llvm::VersionTuple(Major, Minor); unsigned Subminor = - endian::readNext(Data); + endian::readNext(Data); if (NumVersions == 2) return llvm::VersionTuple(Major, Minor, Subminor); - unsigned Build = - endian::readNext(Data); + unsigned Build = endian::readNext(Data); return llvm::VersionTuple(Major, Minor, Subminor, Build); } @@ -71,16 +68,16 @@ class VersionedTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned KeyLength = - endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } static data_type ReadData(internal_key_type Key, const uint8_t *Data, unsigned Length) { unsigned NumElements = - endian::readNext(Data); + endian::readNext(Data); data_type Result; Result.reserve(NumElements); for (unsigned i = 0; i != NumElements; ++i) { @@ -105,14 +102,14 @@ void ReadCommonEntityInfo(const uint8_t *&Data, CommonEntityInfo &Info) { Info.setSwiftPrivate(static_cast((UnavailableBits >> 3) & 0x01)); unsigned MsgLength = - endian::readNext(Data); + endian::readNext(Data); Info.UnavailableMsg = std::string(reinterpret_cast(Data), reinterpret_cast(Data) + MsgLength); Data += MsgLength; unsigned SwiftNameLength = - endian::readNext(Data); + endian::readNext(Data); Info.SwiftName = std::string(reinterpret_cast(Data), reinterpret_cast(Data) + SwiftNameLength); @@ -124,7 +121,7 @@ void ReadCommonTypeInfo(const uint8_t *&Data, CommonTypeInfo &Info) { ReadCommonEntityInfo(Data, Info); unsigned SwiftBridgeLength = - endian::readNext(Data); + endian::readNext(Data); if (SwiftBridgeLength > 0) { Info.setSwiftBridge(std::string(reinterpret_cast(Data), SwiftBridgeLength - 1)); @@ -132,7 +129,7 @@ void ReadCommonTypeInfo(const uint8_t *&Data, CommonTypeInfo &Info) { } unsigned ErrorDomainLength = - endian::readNext(Data); + endian::readNext(Data); if (ErrorDomainLength > 0) { Info.setNSErrorDomain(std::optional(std::string( reinterpret_cast(Data), ErrorDomainLength - 1))); @@ -163,9 +160,9 @@ class IdentifierTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned KeyLength = - endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } @@ -175,8 +172,7 @@ class IdentifierTableInfo { static data_type ReadData(internal_key_type key, const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } }; @@ -203,26 +199,24 @@ class ObjCContextIDTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned KeyLength = - 
endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { auto ParentCtxID = - endian::readNext(Data); + endian::readNext(Data); auto ContextKind = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); + endian::readNext(Data); + auto NameID = endian::readNext(Data); return {ParentCtxID, ContextKind, NameID}; } static data_type ReadData(internal_key_type Key, const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } }; @@ -232,8 +226,7 @@ class ObjCContextInfoTableInfo ObjCContextInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } hash_value_type ComputeHash(internal_key_type Key) { @@ -273,8 +266,7 @@ void ReadVariableInfo(const uint8_t *&Data, VariableInfo &Info) { } ++Data; - auto TypeLen = - endian::readNext(Data); + auto TypeLen = endian::readNext(Data); Info.setType(std::string(Data, Data + TypeLen)); Data += TypeLen; } @@ -286,12 +278,9 @@ class ObjCPropertyTableInfo ObjCPropertyInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto ClassID = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); - char IsInstance = - endian::readNext(Data); + auto ClassID = endian::readNext(Data); + auto NameID = endian::readNext(Data); + char IsInstance = endian::readNext(Data); return {ClassID, NameID, IsInstance}; } @@ -314,8 +303,7 @@ class ObjCPropertyTableInfo void ReadParamInfo(const uint8_t *&Data, ParamInfo &Info) { ReadVariableInfo(Data, Info); - uint8_t Payload = - endian::readNext(Data); + uint8_t Payload = endian::readNext(Data); if (auto RawConvention = Payload & 0x7) { auto Convention = static_cast(RawConvention - 1); Info.setRetainCountConvention(Convention); @@ -331,8 +319,7 @@ void ReadParamInfo(const uint8_t *&Data, ParamInfo &Info) { void ReadFunctionInfo(const uint8_t *&Data, FunctionInfo &Info) { ReadCommonEntityInfo(Data, Info); - uint8_t Payload = - endian::readNext(Data); + uint8_t Payload = endian::readNext(Data); if (auto RawConvention = Payload & 0x7) { auto Convention = static_cast(RawConvention - 1); Info.setRetainCountConvention(Convention); @@ -343,12 +330,12 @@ void ReadFunctionInfo(const uint8_t *&Data, FunctionInfo &Info) { assert(Payload == 0 && "Bad API notes"); Info.NumAdjustedNullable = - endian::readNext(Data); + endian::readNext(Data); Info.NullabilityPayload = - endian::readNext(Data); + endian::readNext(Data); unsigned NumParams = - endian::readNext(Data); + endian::readNext(Data); while (NumParams > 0) { ParamInfo pi; ReadParamInfo(Data, pi); @@ -357,7 +344,7 @@ void ReadFunctionInfo(const uint8_t *&Data, FunctionInfo &Info) { } unsigned ResultTypeLen = - endian::readNext(Data); + endian::readNext(Data); Info.ResultType = std::string(Data, Data + ResultTypeLen); Data += ResultTypeLen; } @@ -369,12 +356,10 @@ class ObjCMethodTableInfo ObjCMethodInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto ClassID = - endian::readNext(Data); + auto ClassID = endian::readNext(Data); auto SelectorID = - endian::readNext(Data); - auto IsInstance = - endian::readNext(Data); + endian::readNext(Data); + auto IsInstance = endian::readNext(Data); return {ClassID, SelectorID, IsInstance}; } @@ -419,29 +404,26 @@ class 
ObjCSelectorTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned KeyLength = - endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { internal_key_type Key; - Key.NumArgs = - endian::readNext(Data); + Key.NumArgs = endian::readNext(Data); unsigned NumIdents = (Length - sizeof(uint16_t)) / sizeof(uint32_t); for (unsigned i = 0; i != NumIdents; ++i) { Key.Identifiers.push_back( - endian::readNext( - Data)); + endian::readNext(Data)); } return Key; } static data_type ReadData(internal_key_type Key, const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } }; @@ -451,12 +433,10 @@ class GlobalVariableTableInfo GlobalVariableInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); + endian::readNext(Data); + auto NameID = endian::readNext(Data); return {CtxID, ContextKind, NameID}; } @@ -478,12 +458,10 @@ class GlobalFunctionTableInfo GlobalFunctionInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); + endian::readNext(Data); + auto NameID = endian::readNext(Data); return {CtxID, ContextKind, NameID}; } @@ -505,8 +483,7 @@ class EnumConstantTableInfo EnumConstantInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto NameID = - endian::readNext(Data); + auto NameID = endian::readNext(Data); return NameID; } @@ -527,13 +504,11 @@ class TagTableInfo : public VersionedTableInfo { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); + endian::readNext(Data); auto NameID = - endian::readNext( - Data); + endian::readNext(Data); return {CtxID, ContextKind, NameID}; } @@ -553,21 +528,21 @@ class TagTableInfo static_cast((Payload & 0x3) - 1); unsigned ImportAsLength = - endian::readNext(Data); + endian::readNext(Data); if (ImportAsLength > 0) { Info.SwiftImportAs = std::string(reinterpret_cast(Data), ImportAsLength - 1); Data += ImportAsLength - 1; } unsigned RetainOpLength = - endian::readNext(Data); + endian::readNext(Data); if (RetainOpLength > 0) { Info.SwiftRetainOp = std::string(reinterpret_cast(Data), RetainOpLength - 1); Data += RetainOpLength - 1; } unsigned ReleaseOpLength = - endian::readNext(Data); + endian::readNext(Data); if (ReleaseOpLength > 0) { Info.SwiftReleaseOp = std::string(reinterpret_cast(Data), ReleaseOpLength - 1); @@ -585,13 +560,11 @@ class TypedefTableInfo TypedefInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); + endian::readNext(Data); auto nameID = - endian::readNext( - Data); + endian::readNext(Data); return {CtxID, ContextKind, nameID}; } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 8c4b460970ad2b..feb60bc54413a5 100644 --- 
a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -915,10 +915,9 @@ ASTSelectorLookupTrait::ReadKey(const unsigned char* d, unsigned) { using namespace llvm::support; SelectorTable &SelTable = Reader.getContext().Selectors; - unsigned N = - endian::readNext(d); + unsigned N = endian::readNext(d); const IdentifierInfo *FirstII = Reader.getLocalIdentifier( - F, endian::readNext(d)); + F, endian::readNext(d)); if (N == 0) return SelTable.getNullarySelector(FirstII); else if (N == 1) @@ -928,7 +927,7 @@ ASTSelectorLookupTrait::ReadKey(const unsigned char* d, unsigned) { Args.push_back(FirstII); for (unsigned I = 1; I != N; ++I) Args.push_back(Reader.getLocalIdentifier( - F, endian::readNext(d))); + F, endian::readNext(d))); return SelTable.getSelector(N, Args.data()); } @@ -941,11 +940,11 @@ ASTSelectorLookupTrait::ReadData(Selector, const unsigned char* d, data_type Result; Result.ID = Reader.getGlobalSelectorID( - F, endian::readNext(d)); + F, endian::readNext(d)); unsigned FullInstanceBits = - endian::readNext(d); + endian::readNext(d); unsigned FullFactoryBits = - endian::readNext(d); + endian::readNext(d); Result.InstanceBits = FullInstanceBits & 0x3; Result.InstanceHasMoreThanOneDecl = (FullInstanceBits >> 2) & 0x1; Result.FactoryBits = FullFactoryBits & 0x3; @@ -956,16 +955,14 @@ ASTSelectorLookupTrait::ReadData(Selector, const unsigned char* d, // Load instance methods for (unsigned I = 0; I != NumInstanceMethods; ++I) { if (ObjCMethodDecl *Method = Reader.GetLocalDeclAs( - F, - endian::readNext(d))) + F, endian::readNext(d))) Result.Instance.push_back(Method); } // Load factory methods for (unsigned I = 0; I != NumFactoryMethods; ++I) { if (ObjCMethodDecl *Method = Reader.GetLocalDeclAs( - F, - endian::readNext(d))) + F, endian::readNext(d))) Result.Factory.push_back(Method); } @@ -1009,8 +1006,7 @@ static bool readBit(unsigned &Bits) { IdentID ASTIdentifierLookupTrait::ReadIdentifierID(const unsigned char *d) { using namespace llvm::support; - unsigned RawID = - endian::readNext(d); + unsigned RawID = endian::readNext(d); return Reader.getGlobalIdentifierID(F, RawID >> 1); } @@ -1028,8 +1024,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, unsigned DataLen) { using namespace llvm::support; - unsigned RawID = - endian::readNext(d); + unsigned RawID = endian::readNext(d); bool IsInteresting = RawID & 0x01; // Wipe out the "is interesting" bit. @@ -1053,9 +1048,8 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, } unsigned ObjCOrBuiltinID = - endian::readNext(d); - unsigned Bits = - endian::readNext(d); + endian::readNext(d); + unsigned Bits = endian::readNext(d); bool CPlusPlusOperatorKeyword = readBit(Bits); bool HasRevertedTokenIDToIdentifier = readBit(Bits); bool Poisoned = readBit(Bits); @@ -1084,7 +1078,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, // definition. 
if (HadMacroDefinition) { uint32_t MacroDirectivesOffset = - endian::readNext(d); + endian::readNext(d); DataLen -= 4; Reader.addPendingMacro(II, &F, MacroDirectivesOffset); @@ -1098,8 +1092,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, SmallVector DeclIDs; for (; DataLen > 0; DataLen -= 4) DeclIDs.push_back(Reader.getGlobalDeclID( - F, - endian::readNext(d))); + F, endian::readNext(d))); Reader.SetGloballyVisibleDecls(II, DeclIDs); } @@ -1169,7 +1162,7 @@ ASTDeclContextNameLookupTrait::ReadFileRef(const unsigned char *&d) { using namespace llvm::support; uint32_t ModuleFileID = - endian::readNext(d); + endian::readNext(d); return Reader.getLocalModuleFile(F, ModuleFileID); } @@ -1189,18 +1182,15 @@ ASTDeclContextNameLookupTrait::ReadKey(const unsigned char *d, unsigned) { case DeclarationName::CXXLiteralOperatorName: case DeclarationName::CXXDeductionGuideName: Data = (uint64_t)Reader.getLocalIdentifier( - F, endian::readNext(d)); + F, endian::readNext(d)); break; case DeclarationName::ObjCZeroArgSelector: case DeclarationName::ObjCOneArgSelector: case DeclarationName::ObjCMultiArgSelector: - Data = - (uint64_t)Reader - .getLocalSelector( - F, - endian::readNext( - d)) - .getAsOpaquePtr(); + Data = (uint64_t)Reader + .getLocalSelector( + F, endian::readNext(d)) + .getAsOpaquePtr(); break; case DeclarationName::CXXOperatorName: Data = *d++; // OverloadedOperatorKind @@ -1223,8 +1213,7 @@ void ASTDeclContextNameLookupTrait::ReadDataInto(internal_key_type, using namespace llvm::support; for (unsigned NumDecls = DataLen / 4; NumDecls; --NumDecls) { - uint32_t LocalID = - endian::readNext(d); + uint32_t LocalID = endian::readNext(d); Val.insert(Reader.getGlobalDeclID(F, LocalID)); } } @@ -2033,10 +2022,9 @@ HeaderFileInfoTrait::ReadKey(const unsigned char *d, unsigned) { using namespace llvm::support; internal_key_type ikey; - ikey.Size = - off_t(endian::readNext(d)); - ikey.ModTime = time_t( - endian::readNext(d)); + ikey.Size = off_t(endian::readNext(d)); + ikey.ModTime = + time_t(endian::readNext(d)); ikey.Filename = (const char *)d; ikey.Imported = true; return ikey; @@ -2064,9 +2052,9 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d, HFI.DirInfo = (Flags >> 1) & 0x07; HFI.IndexHeaderMapHeader = Flags & 0x01; HFI.ControllingMacroID = Reader.getGlobalIdentifierID( - M, endian::readNext(d)); + M, endian::readNext(d)); if (unsigned FrameworkOffset = - endian::readNext(d)) { + endian::readNext(d)) { // The framework offset is 1 greater than the actual offset, // since 0 is used as an indicator for "no framework name". StringRef FrameworkName(FrameworkStrings + FrameworkOffset - 1); @@ -2077,7 +2065,7 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d, "Wrong data length in HeaderFileInfo deserialization"); while (d != End) { uint32_t LocalSMID = - endian::readNext(d); + endian::readNext(d); auto HeaderRole = static_cast(LocalSMID & 7); LocalSMID >>= 3; @@ -4085,9 +4073,8 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const { // how it goes... 
     using namespace llvm::support;
     ModuleKind Kind = static_cast(
-        endian::readNext(Data));
-    uint16_t Len =
-        endian::readNext(Data);
+        endian::readNext(Data));
+    uint16_t Len = endian::readNext(Data);
     StringRef Name = StringRef((const char*)Data, Len);
     Data += Len;
     ModuleFile *OM = (Kind == MK_PrebuiltModule || Kind == MK_ExplicitModule ||
@@ -4103,21 +4090,21 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const {
     }

     SourceLocation::UIntTy SLocOffset =
-        endian::readNext(Data);
+        endian::readNext(Data);
     uint32_t IdentifierIDOffset =
-        endian::readNext(Data);
+        endian::readNext(Data);
     uint32_t MacroIDOffset =
-        endian::readNext(Data);
+        endian::readNext(Data);
     uint32_t PreprocessedEntityIDOffset =
-        endian::readNext(Data);
+        endian::readNext(Data);
     uint32_t SubmoduleIDOffset =
-        endian::readNext(Data);
+        endian::readNext(Data);
     uint32_t SelectorIDOffset =
-        endian::readNext(Data);
+        endian::readNext(Data);
     uint32_t DeclIDOffset =
-        endian::readNext(Data);
+        endian::readNext(Data);
     uint32_t TypeIndexOffset =
-        endian::readNext(Data);
+        endian::readNext(Data);

     auto mapOffset = [&](uint32_t Offset, uint32_t BaseOffset,
                          RemapBuilder &Remap) {
diff --git a/clang/lib/Serialization/GlobalModuleIndex.cpp b/clang/lib/Serialization/GlobalModuleIndex.cpp
index dd4fc3e009050f..8ff10f6a8621e8 100644
--- a/clang/lib/Serialization/GlobalModuleIndex.cpp
+++ b/clang/lib/Serialization/GlobalModuleIndex.cpp
@@ -89,10 +89,8 @@ class IdentifierIndexReaderTrait {
   static std::pair ReadKeyDataLength(const unsigned char*& d) {
     using namespace llvm::support;

-    unsigned KeyLen =
-        endian::readNext(d);
-    unsigned DataLen =
-        endian::readNext(d);
+    unsigned KeyLen = endian::readNext(d);
+    unsigned DataLen = endian::readNext(d);
     return std::make_pair(KeyLen, DataLen);
   }

@@ -113,8 +111,7 @@ class IdentifierIndexReaderTrait {
     data_type Result;

     while (DataLen > 0) {
-      unsigned ID =
-          endian::readNext(d);
+      unsigned ID = endian::readNext(d);
       Result.push_back(ID);
       DataLen -= 4;
     }
@@ -514,8 +511,7 @@ namespace {
       // The first bit indicates whether this identifier is interesting.
       // That's all we care about.
       using namespace llvm::support;
-      unsigned RawID =
-          endian::readNext(d);
+      unsigned RawID = endian::readNext(d);
       bool IsInteresting = RawID & 0x01;
       return std::make_pair(k, IsInteresting);
     }
diff --git a/clang/lib/Serialization/MultiOnDiskHashTable.h b/clang/lib/Serialization/MultiOnDiskHashTable.h
index 2402a628b512fb..a0d75ec3a9e76e 100644
--- a/clang/lib/Serialization/MultiOnDiskHashTable.h
+++ b/clang/lib/Serialization/MultiOnDiskHashTable.h
@@ -200,11 +200,11 @@ template class MultiOnDiskHashTable {
       storage_type Ptr = Data;

       uint32_t BucketOffset =
-          endian::readNext(Ptr);
+          endian::readNext(Ptr);

       // Read the list of overridden files.
       uint32_t NumFiles =
-          endian::readNext(Ptr);
+          endian::readNext(Ptr);
       // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make
       // an additional copy.
       llvm::SmallVector OverriddenFiles;

From a169d4c2e974ceb20b86faea3fa4ac286a1d44e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Tue, 16 Apr 2024 10:37:15 +0300
Subject: =?UTF-8?q?[LLD]=C2=A0[COFF]=20Error=20out=20if=20?=
 =?UTF-8?q?the=20runtime=20pseudo=20relocation=20function=20is=20missing?=
 =?UTF-8?q?=20(#88573)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the linker creates runtime pseudo relocations, it places them in a
list with the assumption that the runtime will fix these relocations
later, when the image gets loaded.
If the relevant runtime function doesn't seem to be present in the linked image, error out. Normally when linking the mingw-w64 runtime libraries, this function always is available. However, if linking without including the mingw-w64 CRT startup files, and the image needs runtime pseudo relocations, make it clear that this won't work as expected at runtime. With ld.bfd, this situation is a hard error too; ld.bfd adds an undefined reference to this symbol if runtime pseudo relocations are needed. A later alternative would be to actually try to pull in the symbol (if seen in a static library, but not included yet). This would allow decoupling the function from the main mingw-w64 CRT startup code (making it optional, only running if the linker actually produced runtime pseudo relocations). Doing that would require restructuring the lld code (gathering pseudo relocations earlier, then loading the relocator function, then pulling in more object files to satisfy the dependencies of the relocator) though. Also, ld.bfd doesn't currently successfully pull in more object files to satisfy the dependency on _pei386_runtime_relocator, so with that in mind, there's not much extra value in making LLD do it currently either; we can't make such a change in mingw-w64's CRT until both linkers handle it. This fixes one issue brought up in https://github.com/llvm/llvm-project/issues/84424. --- lld/COFF/Writer.cpp | 10 ++++++- lld/test/COFF/autoimport-arm-data.s | 3 +++ lld/test/COFF/autoimport-arm64-data.s | 3 +++ lld/test/COFF/autoimport-gnu-implib.s | 3 +++ lld/test/COFF/autoimport-handler-func.s | 36 +++++++++++++++++++++++++ lld/test/COFF/autoimport-warn.s | 3 +++ lld/test/COFF/autoimport-x86.s | 3 +++ 7 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 lld/test/COFF/autoimport-handler-func.s diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 9c20bbb83d86d1..7269d156752de8 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -2072,8 +2072,16 @@ void Writer::createRuntimePseudoRelocs() { return; } - if (!rels.empty()) + if (!rels.empty()) { log("Writing " + Twine(rels.size()) + " runtime pseudo relocations"); + const char *symbolName = "_pei386_runtime_relocator"; + Symbol *relocator = ctx.symtab.findUnderscore(symbolName); + if (!relocator) + error("output image has runtime pseudo relocations, but the function " + + Twine(symbolName) + + " is missing; it is needed for fixing the relocations at runtime"); + } + PseudoRelocTableChunk *table = make(rels); rdataSec->addChunk(table); EmptyChunk *endOfList = make(); diff --git a/lld/test/COFF/autoimport-arm-data.s b/lld/test/COFF/autoimport-arm-data.s index 74604aa5c82343..82c66f0989d490 100644 --- a/lld/test/COFF/autoimport-arm-data.s +++ b/lld/test/COFF/autoimport-arm-data.s @@ -33,6 +33,9 @@ .text .thumb main: + bx lr + .global _pei386_runtime_relocator +_pei386_runtime_relocator: bx lr .data ptr: diff --git a/lld/test/COFF/autoimport-arm64-data.s b/lld/test/COFF/autoimport-arm64-data.s index fa3654be3a71d7..b49bd4f89c97c2 100644 --- a/lld/test/COFF/autoimport-arm64-data.s +++ b/lld/test/COFF/autoimport-arm64-data.s @@ -33,6 +33,9 @@ .global main .text main: + ret + .global _pei386_runtime_relocator +_pei386_runtime_relocator: ret .data ptr: diff --git a/lld/test/COFF/autoimport-gnu-implib.s b/lld/test/COFF/autoimport-gnu-implib.s index d7d4ed626e83ae..d9dc9d7a38fdc3 100644 --- a/lld/test/COFF/autoimport-gnu-implib.s +++ b/lld/test/COFF/autoimport-gnu-implib.s @@ -27,5 +27,8 @@ .text main: movl data(%rip), %eax + ret + 
.global _pei386_runtime_relocator
+_pei386_runtime_relocator:
     ret

     .data
diff --git a/lld/test/COFF/autoimport-handler-func.s b/lld/test/COFF/autoimport-handler-func.s
new file mode 100644
index 00000000000000..02d040bfa274ce
--- /dev/null
+++ b/lld/test/COFF/autoimport-handler-func.s
@@ -0,0 +1,36 @@
+# REQUIRES: x86
+# RUN: split-file %s %t.dir
+
+# RUN: llvm-dlltool -m i386:x86-64 -d %t.dir/lib.def -D lib.dll -l %t.dir/lib.lib
+
+# RUN: llvm-mc -triple=x86_64-windows-gnu %t.dir/main.s -filetype=obj -o %t.dir/main.obj
+# RUN: llvm-mc -triple=x86_64-windows-gnu %t.dir/func.s -filetype=obj -o %t.dir/func.obj
+# RUN: env LLD_IN_TEST=1 not lld-link -lldmingw -out:%t.dir/main.exe -entry:main %t.dir/main.obj %t.dir/lib.lib 2>&1 | FileCheck %s --check-prefix=ERR
+
+# RUN: lld-link -lldmingw -out:%t.dir/main.exe -entry:main %t.dir/main.obj %t.dir/func.obj %t.dir/lib.lib 2>&1 | FileCheck %s --check-prefix=NOERR --allow-empty
+
+# ERR: error: output image has runtime pseudo relocations, but the function _pei386_runtime_relocator is missing; it is needed for fixing the relocations at runtime
+
+# NOERR-NOT: error
+
+#--- main.s
+    .global main
+    .text
+main:
+    ret
+
+    .data
+    .long 1
+    .quad variable
+    .long 2
+
+#--- func.s
+    .global _pei386_runtime_relocator
+    .text
+_pei386_runtime_relocator:
+    ret
+
+#--- lib.def
+EXPORTS
+variable DATA
+
diff --git a/lld/test/COFF/autoimport-warn.s b/lld/test/COFF/autoimport-warn.s
index 9c363ed30f2459..eead0fed861f8f 100644
--- a/lld/test/COFF/autoimport-warn.s
+++ b/lld/test/COFF/autoimport-warn.s
@@ -18,6 +18,9 @@ main:
     movl variable2(%rip), %ecx
     addl %ecx, %eax
     ret
+    .global _pei386_runtime_relocator
+_pei386_runtime_relocator:
+    ret

     .section .rdata$.refptr.variable1,"dr",discard,.refptr.variable1
     .global .refptr.variable1
diff --git a/lld/test/COFF/autoimport-x86.s b/lld/test/COFF/autoimport-x86.s
index fa36f10e9ca912..5d7c9c2c3fa580 100644
--- a/lld/test/COFF/autoimport-x86.s
+++ b/lld/test/COFF/autoimport-x86.s
@@ -55,6 +55,9 @@
     .text
 main:
     movl variable(%rip), %eax
+    ret
+    .global _pei386_runtime_relocator
+_pei386_runtime_relocator:
     ret
     .data
 ptr:

From dbaa1893c9afe6a245860efb8d68875ba4fd6794 Mon Sep 17 00:00:00 2001
From: Pengcheng Wang
Date: Tue, 16 Apr 2024 15:37:31 +0800
Subject: [PATCH 044/300] [RISCV] Generate more W instructions

We rename `TuneNoStripWSuffix` to `TunePreferWInst`.

If all the users of an instruction just use the low 32 bits, we can
convert it to its W variant.
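As a hedged illustration (our own example, not from this patch), consider a
function where every user of an operation only consumes the low 32 bits,
which is the hasAllWUsers() condition the pass checks; on rv64 with
-mattr=+prefer-w-inst (as the new llc tests below use), each 64-bit
operation can be emitted in its W form:

    // Sketch only: all values are i32, so the sign-extending W variants
    // are safe drop-in replacements.
    int narrow_math(int a, int b) {
      int sum = a + b;    // ADD -> ADDW
      int shl = sum << 1; // SLLI (shamt < 32) -> SLLIW
      return shl * b;     // MUL -> MULW
    }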
A quick test on Coremark (`-O3 -march=rv64gc`):

|        | W instructions | code size(.text) |
|--------|----------------|------------------|
| before | 302            | 12257            |
| after  | 343            | 12265            |
|        | +13.58%        | +0.065%          |

Reviewers: asb, dtcxzyw, preames, lukel97, michaelmaitland, topperc

Reviewed By: topperc, dtcxzyw

Pull Request: https://github.com/llvm/llvm-project/pull/87237
---
 llvm/lib/Target/RISCV/RISCVFeatures.td    |   6 +-
 llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp |  89 +++++++-
 llvm/test/CodeGen/RISCV/prefer-w-inst.ll  | 105 +++++++++
 llvm/test/CodeGen/RISCV/prefer-w-inst.mir | 262 ++++++++++++++++++++++
 llvm/test/CodeGen/RISCV/strip-w-suffix.ll |  74 ------
 5 files changed, 447 insertions(+), 89 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/prefer-w-inst.ll
 create mode 100644 llvm/test/CodeGen/RISCV/prefer-w-inst.mir
 delete mode 100644 llvm/test/CodeGen/RISCV/strip-w-suffix.ll

diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 794455aa730400..59962216e0c041 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1226,9 +1226,9 @@ def TuneNoSinkSplatOperands
                        "false", "Disable sink splat operands to enable .vx, .vf,"
                        ".wx, and .wf instructions">;

-def TuneNoStripWSuffix
-    : SubtargetFeature<"no-strip-w-suffix", "EnableStripWSuffix", "false",
-                       "Disable strip W suffix">;
+def TunePreferWInst
+    : SubtargetFeature<"prefer-w-inst", "PreferWInst", "true",
+                       "Prefer instructions with W suffix">;

 def TuneConditionalCompressedMoveFusion
     : SubtargetFeature<"conditional-cmv-fusion", "HasConditionalCompressedMoveFusion",
diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
index 39d420c2fbf080..ead91c5656be8b 100644
--- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
+++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
@@ -12,15 +12,24 @@
 // extended bits aren't consumed or because the input was already sign extended
 // by an earlier instruction.
 //
-// Then it removes the -w suffix from opw instructions whenever all users are
-// dependent only on the lower word of the result of the instruction.
-// The cases handled are:
-// * addw because c.add has a larger register encoding than c.addw.
-// * addiw because it helps reduce test differences between RV32 and RV64
-//   w/o being a pessimization.
-// * mulw because c.mulw doesn't exist but c.mul does (w/ zcb)
-// * slliw because c.slliw doesn't exist and c.slli does
+// Then:
+// 1. Unless explicitly disabled or the target prefers instructions with W suffix,
+//    it removes the -w suffix from opw instructions whenever all users are
+//    dependent only on the lower word of the result of the instruction.
+//    The cases handled are:
+//    * addw because c.add has a larger register encoding than c.addw.
+//    * addiw because it helps reduce test differences between RV32 and RV64
+//      w/o being a pessimization.
+//    * mulw because c.mulw doesn't exist but c.mul does (w/ zcb)
+//    * slliw because c.slliw doesn't exist and c.slli does
 //
+// 2. Or if explicitly enabled or the target prefers instructions with W suffix,
+//    it adds the W suffix to the instruction whenever all users are dependent
+//    only on the lower word of the result of the instruction.
+//    The cases handled are:
+//    * add/addi/sub/mul.
+//    * slli with imm < 32.
+//    * ld/lwu.
//===---------------------------------------------------------------------===// #include "RISCV.h" @@ -60,6 +69,8 @@ class RISCVOptWInstrs : public MachineFunctionPass { const RISCVSubtarget &ST, MachineRegisterInfo &MRI); bool stripWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII, const RISCVSubtarget &ST, MachineRegisterInfo &MRI); + bool appendWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII, + const RISCVSubtarget &ST, MachineRegisterInfo &MRI); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -672,9 +683,6 @@ bool RISCVOptWInstrs::stripWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII, const RISCVSubtarget &ST, MachineRegisterInfo &MRI) { - if (DisableStripWSuffix || !ST.enableStripWSuffix()) - return false; - bool MadeChange = false; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { @@ -698,6 +706,58 @@ bool RISCVOptWInstrs::stripWSuffixes(MachineFunction &MF, return MadeChange; } +bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF, + const RISCVInstrInfo &TII, + const RISCVSubtarget &ST, + MachineRegisterInfo &MRI) { + bool MadeChange = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + unsigned WOpc; + // TODO: Add more? + switch (MI.getOpcode()) { + default: + continue; + case RISCV::ADD: + WOpc = RISCV::ADDW; + break; + case RISCV::ADDI: + WOpc = RISCV::ADDIW; + break; + case RISCV::SUB: + WOpc = RISCV::SUBW; + break; + case RISCV::MUL: + WOpc = RISCV::MULW; + break; + case RISCV::SLLI: + // SLLIW reads the lowest 5 bits, while SLLI reads lowest 6 bits + if (MI.getOperand(2).getImm() >= 32) + continue; + WOpc = RISCV::SLLIW; + break; + case RISCV::LD: + case RISCV::LWU: + WOpc = RISCV::LW; + break; + } + + if (hasAllWUsers(MI, ST, MRI)) { + LLVM_DEBUG(dbgs() << "Replacing " << MI); + MI.setDesc(TII.get(WOpc)); + MI.clearFlag(MachineInstr::MIFlag::NoSWrap); + MI.clearFlag(MachineInstr::MIFlag::NoUWrap); + MI.clearFlag(MachineInstr::MIFlag::IsExact); + LLVM_DEBUG(dbgs() << " with " << MI); + ++NumTransformedToWInstrs; + MadeChange = true; + } + } + } + + return MadeChange; +} + bool RISCVOptWInstrs::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -711,7 +771,12 @@ bool RISCVOptWInstrs::runOnMachineFunction(MachineFunction &MF) { bool MadeChange = false; MadeChange |= removeSExtWInstrs(MF, TII, ST, MRI); - MadeChange |= stripWSuffixes(MF, TII, ST, MRI); + + if (!(DisableStripWSuffix || ST.preferWInst())) + MadeChange |= stripWSuffixes(MF, TII, ST, MRI); + + if (ST.preferWInst()) + MadeChange |= appendWSuffixes(MF, TII, ST, MRI); return MadeChange; } diff --git a/llvm/test/CodeGen/RISCV/prefer-w-inst.ll b/llvm/test/CodeGen/RISCV/prefer-w-inst.ll new file mode 100644 index 00000000000000..34ab74d78a76fb --- /dev/null +++ b/llvm/test/CodeGen/RISCV/prefer-w-inst.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=NO-PREFER-W-INST %s +; RUN: llc -mtriple=riscv64 -mattr=+m -riscv-disable-strip-w-suffix -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=NO-STRIP %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+prefer-w-inst -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=PREFER-W-INST %s + +define i32 @addiw(i32 %a) { +; NO-PREFER-W-INST-LABEL: addiw: +; NO-PREFER-W-INST: # %bb.0: +; NO-PREFER-W-INST-NEXT: lui a1, 1 +; NO-PREFER-W-INST-NEXT: addi a1, a1, -1 +; 
NO-PREFER-W-INST-NEXT: addw a0, a0, a1 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: addiw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: lui a1, 1 +; NO-STRIP-NEXT: addiw a1, a1, -1 +; NO-STRIP-NEXT: addw a0, a0, a1 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: addiw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: lui a1, 1 +; PREFER-W-INST-NEXT: addiw a1, a1, -1 +; PREFER-W-INST-NEXT: addw a0, a0, a1 +; PREFER-W-INST-NEXT: ret + %ret = add i32 %a, 4095 + ret i32 %ret +} + +define i32 @addw(i32 %a, i32 %b) { +; NO-PREFER-W-INST-LABEL: addw: +; NO-PREFER-W-INST: # %bb.0: +; NO-PREFER-W-INST-NEXT: add a0, a0, a1 +; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: addw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: addw a0, a0, a1 +; NO-STRIP-NEXT: addiw a0, a0, 1024 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: addw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: addw a0, a0, a1 +; PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; PREFER-W-INST-NEXT: ret + %add = add i32 %a, %b + %ret = add i32 %add, 1024 + ret i32 %ret +} + +define i32 @mulw(i32 %a, i32 %b) { +; NO-PREFER-W-INST-LABEL: mulw: +; NO-PREFER-W-INST: # %bb.0: +; NO-PREFER-W-INST-NEXT: mul a1, a0, a1 +; NO-PREFER-W-INST-NEXT: mul a0, a0, a1 +; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: mulw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: mulw a1, a0, a1 +; NO-STRIP-NEXT: mulw a0, a0, a1 +; NO-STRIP-NEXT: addiw a0, a0, 1024 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: mulw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: mulw a1, a0, a1 +; PREFER-W-INST-NEXT: mulw a0, a0, a1 +; PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; PREFER-W-INST-NEXT: ret + %mul1 = mul i32 %a, %b + %mul = mul i32 %a, %mul1 + %ret = add i32 %mul, 1024 + ret i32 %ret +} + +define i32 @slliw(i32 %a) { +; NO-PREFER-W-INST-LABEL: slliw: +; NO-PREFER-W-INST: # %bb.0: +; NO-PREFER-W-INST-NEXT: slli a0, a0, 1 +; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: slliw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: slliw a0, a0, 1 +; NO-STRIP-NEXT: addiw a0, a0, 1024 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: slliw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: slliw a0, a0, 1 +; PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; PREFER-W-INST-NEXT: ret + %shl = shl i32 %a, 1 + %ret = add i32 %shl, 1024 + ret i32 %ret +} diff --git a/llvm/test/CodeGen/RISCV/prefer-w-inst.mir b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir new file mode 100644 index 00000000000000..e05e27af4271c1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir @@ -0,0 +1,262 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc %s -mtriple=riscv64 -run-pass=riscv-opt-w-instrs -verify-machineinstrs \ +# RUN: -mattr=+m -o - | FileCheck %s -check-prefixes=NO-PREFER-W-INST +# RUN: llc %s -mtriple=riscv64 -run-pass=riscv-opt-w-instrs -verify-machineinstrs \ +# RUN: -mattr=+m,+prefer-w-inst -o - | FileCheck %s -check-prefixes=PREFER-W-INST + +--- +name: addi +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: addi + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY]], 1 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADDI]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY 
[[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: addi + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[COPY]], 1 + ; PREFER-W-INST-NEXT: [[ADDIW1:%[0-9]+]]:gpr = ADDIW [[ADDIW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW1]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = ADDI %1, 1 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: add +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: add + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[ADD:%[0-9]+]]:gpr = ADD [[COPY]], [[COPY1]] + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADD]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: add + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[ADDW:%[0-9]+]]:gpr = ADDW [[COPY]], [[COPY1]] + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADDW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = ADD %1, %2 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: sub +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: sub + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[SUB:%[0-9]+]]:gpr = SUB [[COPY]], [[COPY1]] + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SUB]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: sub + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[SUBW:%[0-9]+]]:gpr = SUBW [[COPY]], [[COPY1]] + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SUBW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = SUB %1, %2 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... 
+ +--- +name: mul +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: mul + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[MUL:%[0-9]+]]:gpr = MUL [[COPY]], [[COPY1]] + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[MUL]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: mul + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[MULW:%[0-9]+]]:gpr = MULW [[COPY]], [[COPY1]] + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[MULW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = MUL %1, %2 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + + +--- +name: slli_31 +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: slli_31 + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 31 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: slli_31 + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[SLLIW:%[0-9]+]]:gpr = SLLIW [[COPY]], 31 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLIW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = SLLI %1, 31 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: slli_32 +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: slli_32 + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 32 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: slli_32 + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 32 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = SLLI %1, 32 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... 
+ +--- +name: ld +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: ld + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[LD:%[0-9]+]]:gpr = LD [[COPY]], 0 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LD]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: ld + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[LW:%[0-9]+]]:gpr = LW [[COPY]], 0 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = LD %1, 0 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: lwu +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: lwu + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[LWU:%[0-9]+]]:gpr = LWU [[COPY]], 0 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LWU]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: lwu + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[LW:%[0-9]+]]:gpr = LW [[COPY]], 0 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = LWU %1, 0 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... 
diff --git a/llvm/test/CodeGen/RISCV/strip-w-suffix.ll b/llvm/test/CodeGen/RISCV/strip-w-suffix.ll
deleted file mode 100644
index 4124b3d0d360d2..00000000000000
--- a/llvm/test/CodeGen/RISCV/strip-w-suffix.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \
-; RUN:   | FileCheck -check-prefixes=STRIP %s
-; RUN: llc -mtriple=riscv64 -mattr=+m,+no-strip-w-suffix -verify-machineinstrs < %s \
-; RUN:   | FileCheck -check-prefixes=NO-STRIP %s
-
-define i32 @addiw(i32 %a) {
-; STRIP-LABEL: addiw:
-; STRIP:       # %bb.0:
-; STRIP-NEXT:    lui a1, 1
-; STRIP-NEXT:    addi a1, a1, -1
-; STRIP-NEXT:    addw a0, a0, a1
-; STRIP-NEXT:    ret
-;
-; NO-STRIP-LABEL: addiw:
-; NO-STRIP:       # %bb.0:
-; NO-STRIP-NEXT:    lui a1, 1
-; NO-STRIP-NEXT:    addiw a1, a1, -1
-; NO-STRIP-NEXT:    addw a0, a0, a1
-; NO-STRIP-NEXT:    ret
-  %ret = add i32 %a, 4095
-  ret i32 %ret
-}
-
-define i32 @addw(i32 %a, i32 %b) {
-; STRIP-LABEL: addw:
-; STRIP:       # %bb.0:
-; STRIP-NEXT:    add a0, a0, a1
-; STRIP-NEXT:    addiw a0, a0, 1024
-; STRIP-NEXT:    ret
-;
-; NO-STRIP-LABEL: addw:
-; NO-STRIP:       # %bb.0:
-; NO-STRIP-NEXT:    addw a0, a0, a1
-; NO-STRIP-NEXT:    addiw a0, a0, 1024
-; NO-STRIP-NEXT:    ret
-  %add = add i32 %a, %b
-  %ret = add i32 %add, 1024
-  ret i32 %ret
-}
-
-define i32 @mulw(i32 %a, i32 %b) {
-; STRIP-LABEL: mulw:
-; STRIP:       # %bb.0:
-; STRIP-NEXT:    mul a0, a0, a1
-; STRIP-NEXT:    addiw a0, a0, 1024
-; STRIP-NEXT:    ret
-;
-; NO-STRIP-LABEL: mulw:
-; NO-STRIP:       # %bb.0:
-; NO-STRIP-NEXT:    mulw a0, a0, a1
-; NO-STRIP-NEXT:    addiw a0, a0, 1024
-; NO-STRIP-NEXT:    ret
-  %mul = mul i32 %a, %b
-  %ret = add i32 %mul, 1024
-  ret i32 %ret
-}
-
-define i32 @slliw(i32 %a) {
-; STRIP-LABEL: slliw:
-; STRIP:       # %bb.0:
-; STRIP-NEXT:    slli a0, a0, 1
-; STRIP-NEXT:    addiw a0, a0, 1024
-; STRIP-NEXT:    ret
-;
-; NO-STRIP-LABEL: slliw:
-; NO-STRIP:       # %bb.0:
-; NO-STRIP-NEXT:    slliw a0, a0, 1
-; NO-STRIP-NEXT:    addiw a0, a0, 1024
-; NO-STRIP-NEXT:    ret
-  %shl = shl i32 %a, 1
-  %ret = add i32 %shl, 1024
-  ret i32 %ret
-}

From d26dd58ca5b59032eb371b8f51d9134acdd8d3ad Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Tue, 16 Apr 2024 14:37:38 +0800
Subject: [PATCH 045/300] [StmtProfile] Don't profile the body of lambda
 expressions

Close https://github.com/llvm/llvm-project/issues/87609

We tried to profile the body of the lambda expressions in
https://reviews.llvm.org/D153957. But as the original comments show, it
is indeed dangerous. After we tried to skip calculating the ODR hash
values recently, we have fallen into this trap twice.

So in this patch, I choose not to profile the body of the lambda
expression. The signature of the lambda is still profiled.
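As a rough sketch of the new behavior (our own example, not from this patch;
it maps "signature" to the AddFunctionDecl(Call, /*SkipBody=*/true) call in
the change below):

    // Self-contained C++ illustration: only the lambda's signature and its
    // captures feed the profile now; the statements in the body do not.
    int use(int v) {
      auto l = [v](int x) -> int { // captures + parameter/return types:
                                   // still profiled
        return x + v + 1;          // body: no longer profiled
      };
      return l(41);
    }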
--- clang/include/clang/AST/DeclBase.h | 10 ----- clang/include/clang/Serialization/ASTReader.h | 6 +++ clang/lib/AST/Decl.cpp | 2 +- clang/lib/AST/DeclBase.cpp | 5 --- clang/lib/AST/StmtProfile.cpp | 22 +++++++++- clang/lib/Serialization/ASTReader.cpp | 2 +- clang/lib/Serialization/ASTReaderDecl.cpp | 8 ++-- clang/lib/Serialization/ASTWriter.cpp | 2 +- clang/lib/Serialization/ASTWriterDecl.cpp | 8 ++-- .../hashing-decls-in-exprs-from-gmf-2.cppm | 44 +++++++++++++++++++ 10 files changed, 81 insertions(+), 28 deletions(-) create mode 100644 clang/test/Modules/hashing-decls-in-exprs-from-gmf-2.cppm diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 2194d268fa86f0..1079993f496945 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -672,16 +672,6 @@ class alignas(8) Decl { /// Whether this declaration comes from explicit global module. bool isFromExplicitGlobalModule() const; - /// Check if we should skip checking ODRHash for declaration \param D. - /// - /// The existing ODRHash mechanism seems to be not stable enough and - /// the false positive ODR violation reports are annoying and we rarely see - /// true ODR violation reports. Also we learned that MSVC disabled ODR checks - /// for declarations in GMF. So we try to disable ODR checks in the GMF to - /// get better user experiences before we make the ODR violation checks stable - /// enough. - bool shouldSkipCheckingODR() const; - /// Return true if this declaration has an attribute which acts as /// definition of the entity, such as 'alias' or 'ifunc'. bool hasDefiningAttr() const; diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index e3fde887f99cb7..43ee06c524b3a0 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -2457,6 +2457,12 @@ class BitsUnpacker { uint32_t Value; uint32_t CurrentBitsIndex = ~0; }; + +inline bool shouldSkipCheckingODR(const Decl *D) { + return D->getASTContext().getLangOpts().SkipODRCheckInGMF && + D->isFromExplicitGlobalModule(); +} + } // namespace clang #endif // LLVM_CLANG_SERIALIZATION_ASTREADER_H diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 2b2d5a2663a18b..33b6f8611f2162 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -4534,7 +4534,7 @@ unsigned FunctionDecl::getODRHash() { } class ODRHash Hash; - Hash.AddFunctionDecl(this, /*SkipBody=*/shouldSkipCheckingODR()); + Hash.AddFunctionDecl(this); setHasODRHash(true); ODRHash = Hash.CalculateHash(); return ODRHash; diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 66a727d9dd0c39..434926324c96ca 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1106,11 +1106,6 @@ bool Decl::isFromExplicitGlobalModule() const { return getOwningModule() && getOwningModule()->isExplicitGlobalModule(); } -bool Decl::shouldSkipCheckingODR() const { - return getASTContext().getLangOpts().SkipODRCheckInGMF && - isFromExplicitGlobalModule(); -} - static Decl::Kind getKind(const Decl *D) { return D->getKind(); } static Decl::Kind getKind(const DeclContext *DC) { return DC->getDeclKind(); } diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index d2aac1e640380f..789e4634bd293b 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2071,13 +2071,31 @@ StmtProfiler::VisitLambdaExpr(const LambdaExpr *S) { } CXXRecordDecl *Lambda = S->getLambdaClass(); - 
ID.AddInteger(Lambda->getODRHash()); - for (const auto &Capture : Lambda->captures()) { ID.AddInteger(Capture.getCaptureKind()); if (Capture.capturesVariable()) VisitDecl(Capture.getCapturedVar()); } + + // Profiling the body of the lambda may be dangerous during deserialization. + // So we'd like only to profile the signature here. + ODRHash Hasher; + // FIXME: We can't get the operator call easily by + // `CXXRecordDecl::getLambdaCallOperator()` if we're in deserialization. + // So we have to do something raw here. + for (auto *SubDecl : Lambda->decls()) { + FunctionDecl *Call = nullptr; + if (auto *FTD = dyn_cast(SubDecl)) + Call = FTD->getTemplatedDecl(); + else if (auto *FD = dyn_cast(SubDecl)) + Call = FD; + + if (!Call) + continue; + + Hasher.AddFunctionDecl(Call, /*SkipBody=*/true); + } + ID.AddInteger(Hasher.CalculateHash()); } void diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index feb60bc54413a5..f47d540ea4b86d 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -9785,7 +9785,7 @@ void ASTReader::finishPendingActions() { !NonConstDefn->isLateTemplateParsed() && // We only perform ODR checks for decls not in the explicit // global module fragment. - !FD->shouldSkipCheckingODR() && + !shouldSkipCheckingODR(FD) && FD->getODRHash() != NonConstDefn->getODRHash()) { if (!isa(FD)) { PendingFunctionOdrMergeFailures[FD].push_back(NonConstDefn); diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index e4b6a75c118ba3..74d40f7da34cad 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -826,7 +826,7 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) { Reader.mergeDefinitionVisibility(OldDef, ED); // We don't want to check the ODR hash value for declarations from global // module fragment. - if (!ED->shouldSkipCheckingODR() && + if (!shouldSkipCheckingODR(ED) && OldDef->getODRHash() != ED->getODRHash()) Reader.PendingEnumOdrMergeFailures[OldDef].push_back(ED); } else { @@ -868,7 +868,7 @@ void ASTDeclReader::VisitRecordDecl(RecordDecl *RD) { VisitRecordDeclImpl(RD); // We should only reach here if we're in C/Objective-C. There is no // global module fragment. - assert(!RD->shouldSkipCheckingODR()); + assert(!shouldSkipCheckingODR(RD)); RD->setODRHash(Record.readInt()); // Maintain the invariant of a redeclaration chain containing only @@ -2155,7 +2155,7 @@ void ASTDeclReader::MergeDefinitionData( } // We don't want to check ODR for decls in the global module fragment. - if (MergeDD.Definition->shouldSkipCheckingODR()) + if (shouldSkipCheckingODR(MergeDD.Definition)) return; if (D->getODRHash() != MergeDD.ODRHash) { @@ -3530,7 +3530,7 @@ ASTDeclReader::FindExistingResult ASTDeclReader::findExisting(NamedDecl *D) { // same template specialization into the same CXXRecordDecl. 
auto MergedDCIt = Reader.MergedDeclContexts.find(D->getLexicalDeclContext()); if (MergedDCIt != Reader.MergedDeclContexts.end() && - !D->shouldSkipCheckingODR() && MergedDCIt->second == D->getDeclContext()) + !shouldSkipCheckingODR(D) && MergedDCIt->second == D->getDeclContext()) Reader.PendingOdrMergeChecks.push_back(D); return FindExistingResult(Reader, D, /*Existing=*/nullptr, diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 85b7fd5535a1bf..ce6fa1feb1eeb3 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6188,7 +6188,7 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { BitsPacker DefinitionBits; - bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); DefinitionBits.addBit(ShouldSkipCheckingODR); #define FIELD(Name, Width, Merge) \ diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 276b6257f1d841..d0d49bcdf991a9 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -526,7 +526,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { BitsPacker EnumDeclBits; EnumDeclBits.addBits(D->getNumPositiveBits(), /*BitWidth=*/8); EnumDeclBits.addBits(D->getNumNegativeBits(), /*BitWidth=*/8); - bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); EnumDeclBits.addBit(ShouldSkipCheckingODR); EnumDeclBits.addBit(D->isScoped()); EnumDeclBits.addBit(D->isScopedUsingClassTag()); @@ -552,7 +552,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { !D->isTopLevelDeclInObjCContainer() && !CXXRecordDecl::classofKind(D->getKind()) && !D->getIntegerTypeSourceInfo() && !D->getMemberSpecializationInfo() && - !needsAnonymousDeclarationNumber(D) && !D->shouldSkipCheckingODR() && + !needsAnonymousDeclarationNumber(D) && !shouldSkipCheckingODR(D) && D->getDeclName().getNameKind() == DeclarationName::Identifier) AbbrevToUse = Writer.getDeclEnumAbbrev(); @@ -718,7 +718,7 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { // FIXME: stable encoding FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3); FunctionDeclBits.addBits((uint32_t)D->getStorageClass(), /*BitWidth=*/3); - bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); FunctionDeclBits.addBit(ShouldSkipCheckingODR); FunctionDeclBits.addBit(D->isInlineSpecified()); FunctionDeclBits.addBit(D->isInlined()); @@ -1559,7 +1559,7 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) { D->getFirstDecl() == D->getMostRecentDecl() && !D->isInvalidDecl() && !D->hasAttrs() && !D->isTopLevelDeclInObjCContainer() && D->getDeclName().getNameKind() == DeclarationName::Identifier && - !D->shouldSkipCheckingODR() && !D->hasExtInfo() && + !shouldSkipCheckingODR(D) && !D->hasExtInfo() && !D->isExplicitlyDefaulted()) { if (D->getTemplatedKind() == FunctionDecl::TK_NonTemplate || D->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate || diff --git a/clang/test/Modules/hashing-decls-in-exprs-from-gmf-2.cppm b/clang/test/Modules/hashing-decls-in-exprs-from-gmf-2.cppm new file mode 100644 index 00000000000000..66143102cb9e40 --- /dev/null +++ b/clang/test/Modules/hashing-decls-in-exprs-from-gmf-2.cppm @@ -0,0 +1,44 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/A.cppm 
-emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/test.cpp -fprebuilt-module-path=%t -fsyntax-only -verify + +//--- header.h +#pragma once +template <class _Tp> +class Optional {}; + +template <class _Tp> +concept C = requires(const _Tp& __t) { + []<class _Up>(const Optional<_Up>&) {}(__t); +}; + +//--- func.h +#include "header.h" +template <C T> +void func() {} + +//--- test_func.h +#include "func.h" + +inline void test_func() { + func<Optional<int>>(); +} + +//--- A.cppm +module; +#include "header.h" +#include "test_func.h" +export module A; +export using ::test_func; + +//--- test.cpp +// expected-no-diagnostics +import A; +#include "test_func.h" + +void test() { + test_func(); +} From a29e85d6626aae7dba44c66ee5b703d3ab5de41c Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 16 Apr 2024 07:41:42 +0000 Subject: [PATCH 046/300] [llvm-exegesis] Change preprocessor directives for getCurrentTID This patch changes the preprocessor directives surrounding getCurrentTID, particularly moving it out of the block that is only compiled when not building for Android. The getCurrentTID function is called in places that only require Linux definitions, so it should have the same preprocessor scoping around it to prevent link-time failures. --- llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp index 0a947f6e206fef..4699fbbea5def0 100644 --- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp +++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp @@ -22,7 +22,7 @@ namespace llvm { namespace exegesis { -#if defined(__linux__) && !defined(__ANDROID__) +#if defined(__linux__) long SubprocessMemory::getCurrentTID() { // We're using the raw syscall here rather than the gettid() function provided // return syscall(SYS_gettid); } +#if !defined(__ANDROID__) + Error SubprocessMemory::initializeSubprocessMemory(pid_t ProcessID) { // Add the PID to the shared memory name so that if we're running multiple // processes at the same time, they won't interfere with each other. @@ -157,7 +159,8 @@ Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess( SubprocessMemory::~SubprocessMemory() {} -#endif // defined(__linux__) && !defined(__ANDROID__) +#endif // !defined(__ANDROID__) +#endif // defined(__linux__) } // namespace exegesis } // namespace llvm From d34a2c2adb2a4f1dc262c5756d3725caa4ea2571 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Tue, 16 Apr 2024 15:55:14 +0800 Subject: [PATCH 047/300] [RISCV] Make more vector pseudos commutable This PR includes: * vadd.vv/vand.vv/vor.vv/vxor.vv * vmseq.vv/vmsne.vv * vmin.vv/vminu.vv/vmax.vv/vmaxu.vv * vmul.vv/vmulh.vv/vmulhu.vv * vwadd.vv/vwaddu.vv * vwmul.vv/vwmulu.vv * vwmacc.vv/vwmaccu.vv * vadc.vvm There are no test changes; I may add them later.
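As background for the change below: flagging a pseudo isCommutable and reporting its swappable sources through findCommutedOpIndices() is what lets generic codegen exchange the two source operands, e.g. so that register allocation can avoid a copy. A minimal sketch of a consumer, written against the generic TargetInstrInfo API (illustrative only, not code from this patch):

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/TargetInstrInfo.h"
    using namespace llvm;

    // Try to commute the two source operands of MI in place; for the RVV
    // pseudos handled below (e.g. PseudoVADD_VV) these are operands 2 and 3.
    static void tryCommute(const TargetInstrInfo &TII, MachineInstr &MI) {
      unsigned Idx1 = TargetInstrInfo::CommuteAnyOperandIndex;
      unsigned Idx2 = TargetInstrInfo::CommuteAnyOperandIndex;
      if (TII.findCommutedOpIndices(MI, Idx1, Idx2))
        TII.commuteInstruction(MI, /*NewMI=*/false, Idx1, Idx2);
    }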
Fixes part of #64422 Reviewers: michaelmaitland, preames, lukel97, topperc, asb Reviewed By: topperc, lukel97 Pull Request: https://github.com/llvm/llvm-project/pull/88379 --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 66 +++++++++++++ .../Target/RISCV/RISCVInstrInfoVPseudos.td | 92 ++++++++++--------- 2 files changed, 116 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index b0fda040519a57..668062c8d33f6f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2718,6 +2718,50 @@ std::string RISCVInstrInfo::createMIROperandComment( return Comment; } +// clang-format off +#define CASE_RVV_OPCODE_UNMASK_LMUL(OP, LMUL) \ + RISCV::Pseudo##OP##_##LMUL + +#define CASE_RVV_OPCODE_MASK_LMUL(OP, LMUL) \ + RISCV::Pseudo##OP##_##LMUL##_MASK + +#define CASE_RVV_OPCODE_LMUL(OP, LMUL) \ + CASE_RVV_OPCODE_UNMASK_LMUL(OP, LMUL): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, LMUL) + +#define CASE_RVV_OPCODE_UNMASK_WIDEN(OP) \ + CASE_RVV_OPCODE_UNMASK_LMUL(OP, MF8): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, MF4): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, MF2): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M1): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M2): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M4) + +#define CASE_RVV_OPCODE_UNMASK(OP) \ + CASE_RVV_OPCODE_UNMASK_WIDEN(OP): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M8) + +#define CASE_RVV_OPCODE_MASK_WIDEN(OP) \ + CASE_RVV_OPCODE_MASK_LMUL(OP, MF8): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, MF4): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, MF2): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M1): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M2): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M4) + +#define CASE_RVV_OPCODE_MASK(OP) \ + CASE_RVV_OPCODE_MASK_WIDEN(OP): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M8) + +#define CASE_RVV_OPCODE_WIDEN(OP) \ + CASE_RVV_OPCODE_UNMASK_WIDEN(OP): \ + case CASE_RVV_OPCODE_MASK_WIDEN(OP) + +#define CASE_RVV_OPCODE(OP) \ + CASE_RVV_OPCODE_UNMASK(OP): \ + case CASE_RVV_OPCODE_MASK(OP) +// clang-format on + // clang-format off #define CASE_VMA_OPCODE_COMMON(OP, TYPE, LMUL) \ RISCV::PseudoV##OP##_##TYPE##_##LMUL @@ -2798,6 +2842,28 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, case RISCV::PseudoCCMOVGPR: // Operands 4 and 5 are commutable. return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 4, 5); + case CASE_RVV_OPCODE(VADD_VV): + case CASE_RVV_OPCODE(VAND_VV): + case CASE_RVV_OPCODE(VOR_VV): + case CASE_RVV_OPCODE(VXOR_VV): + case CASE_RVV_OPCODE_MASK(VMSEQ_VV): + case CASE_RVV_OPCODE_MASK(VMSNE_VV): + case CASE_RVV_OPCODE(VMIN_VV): + case CASE_RVV_OPCODE(VMINU_VV): + case CASE_RVV_OPCODE(VMAX_VV): + case CASE_RVV_OPCODE(VMAXU_VV): + case CASE_RVV_OPCODE(VMUL_VV): + case CASE_RVV_OPCODE(VMULH_VV): + case CASE_RVV_OPCODE(VMULHU_VV): + case CASE_RVV_OPCODE_WIDEN(VWADD_VV): + case CASE_RVV_OPCODE_WIDEN(VWADDU_VV): + case CASE_RVV_OPCODE_WIDEN(VWMUL_VV): + case CASE_RVV_OPCODE_WIDEN(VWMULU_VV): + case CASE_RVV_OPCODE_WIDEN(VWMACC_VV): + case CASE_RVV_OPCODE_WIDEN(VWMACCU_VV): + case CASE_RVV_OPCODE_UNMASK(VADC_VVM): + // Operands 2 and 3 are commutable. 
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); case CASE_VFMA_SPLATS(FMADD): case CASE_VFMA_SPLATS(FMSUB): case CASE_VFMA_SPLATS(FMACC): diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index ad1821d57256bc..435cd7f84c6122 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -2127,8 +2127,9 @@ multiclass VPseudoBinary { - let VLMul = MInfo.value, SEW=sew in { + int TargetConstraintType = 1, + bit Commutable = 0> { + let VLMul = MInfo.value, SEW=sew, isCommutable = Commutable in { defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); def suffix : VPseudoBinaryNoMaskTU; @@ -2167,8 +2168,9 @@ multiclass VPseudoBinaryM { - let VLMul = MInfo.value in { + int TargetConstraintType = 1, + bit Commutable = 0> { + let VLMul = MInfo.value, isCommutable = Commutable in { def "_" # MInfo.MX : VPseudoBinaryMOutNoMask; let ForceTailAgnostic = true in @@ -2226,8 +2228,8 @@ multiclass VPseudoTiedBinaryRoundingMode { - defm _VV : VPseudoBinary; +multiclass VPseudoBinaryV_VV { + defm _VV : VPseudoBinary; } multiclass VPseudoBinaryV_VV_RM { @@ -2331,9 +2333,10 @@ multiclass VPseudoVALU_MM { // * The destination EEW is greater than the source EEW, the source EMUL is // at least 1, and the overlap is in the highest-numbered part of the // destination register group is legal. Otherwise, it is illegal. -multiclass VPseudoBinaryW_VV { +multiclass VPseudoBinaryW_VV { defm _VV : VPseudoBinary; + "@earlyclobber $rd", TargetConstraintType=3, + Commutable=Commutable>; } multiclass VPseudoBinaryW_VV_RM { @@ -2453,7 +2456,9 @@ multiclass VPseudoBinaryV_VM; } -multiclass VPseudoTiedBinaryV_VM { +multiclass VPseudoTiedBinaryV_VM { + let isCommutable = Commutable in def "_VVM" # "_" # m.MX: VPseudoTiedBinaryCarryIn.R, m.vrclass, m.vrclass, m, 1, "", @@ -2667,9 +2672,11 @@ multiclass PseudoVEXT_VF8 { // lowest-numbered part of the source register group". // With LMUL<=1 the source and dest occupy a single register so any overlap // is in the lowest-numbered part. 
-multiclass VPseudoBinaryM_VV { +multiclass VPseudoBinaryM_VV { defm _VV : VPseudoBinaryM; + !if(!ge(m.octuple, 16), "@earlyclobber $rd", ""), + TargetConstraintType, Commutable=Commutable>; } multiclass VPseudoBinaryM_VX { @@ -2751,10 +2758,11 @@ multiclass VPseudoVSSHT_VV_VX_VI_RM { +multiclass VPseudoVALU_VV_VX_VI { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryV_VV, SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", mx, forceMergeOpRead=true>; defm "" : VPseudoBinaryV_VX, @@ -2804,17 +2812,17 @@ multiclass VPseudoVAALU_VV_VX_RM { multiclass VPseudoVMINMAX_VV_VX { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryV_VV, SchedBinary<"WriteVIMinMaxV", "ReadVIMinMaxV", "ReadVIMinMaxV", mx>; defm "" : VPseudoBinaryV_VX, SchedBinary<"WriteVIMinMaxX", "ReadVIMinMaxV", "ReadVIMinMaxX", mx>; } } -multiclass VPseudoVMUL_VV_VX { +multiclass VPseudoVMUL_VV_VX { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryV_VV, SchedBinary<"WriteVIMulV", "ReadVIMulV", "ReadVIMulV", mx>; defm "" : VPseudoBinaryV_VX, SchedBinary<"WriteVIMulX", "ReadVIMulV", "ReadVIMulX", mx>; @@ -2964,10 +2972,10 @@ multiclass VPseudoVALU_VX_VI { } } -multiclass VPseudoVWALU_VV_VX { +multiclass VPseudoVWALU_VV_VX { foreach m = MxListW in { defvar mx = m.MX; - defm "" : VPseudoBinaryW_VV, + defm "" : VPseudoBinaryW_VV, SchedBinary<"WriteVIWALUV", "ReadVIWALUV", "ReadVIWALUV", mx, forceMergeOpRead=true>; defm "" : VPseudoBinaryW_VX, @@ -2976,10 +2984,10 @@ multiclass VPseudoVWALU_VV_VX { } } -multiclass VPseudoVWMUL_VV_VX { +multiclass VPseudoVWMUL_VV_VX { foreach m = MxListW in { defvar mx = m.MX; - defm "" : VPseudoBinaryW_VV, + defm "" : VPseudoBinaryW_VV, SchedBinary<"WriteVIWMulV", "ReadVIWMulV", "ReadVIWMulV", mx, forceMergeOpRead=true>; defm "" : VPseudoBinaryW_VX, @@ -3074,7 +3082,7 @@ multiclass VPseudoVMRG_VM_XM_IM { multiclass VPseudoVCALU_VM_XM_IM { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoTiedBinaryV_VM, + defm "" : VPseudoTiedBinaryV_VM, SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx, forceMergeOpRead=true>; defm "" : VPseudoTiedBinaryV_XM, @@ -3287,10 +3295,10 @@ multiclass VPseudoTernaryV_VF_AAXA_RM; } -multiclass VPseudoTernaryW_VV { +multiclass VPseudoTernaryW_VV { defvar constraint = "@earlyclobber $rd"; defm _VV : VPseudoTernaryWithPolicy; + constraint, Commutable=Commutable, TargetConstraintType=3>; } multiclass VPseudoTernaryW_VV_RM { @@ -3380,10 +3388,10 @@ multiclass VPseudoVSLD_VX_VI { } } -multiclass VPseudoVWMAC_VV_VX { +multiclass VPseudoVWMAC_VV_VX { foreach m = MxListW in { defvar mx = m.MX; - defm "" : VPseudoTernaryW_VV, + defm "" : VPseudoTernaryW_VV, SchedTernary<"WriteVIWMulAddV", "ReadVIWMulAddV", "ReadVIWMulAddV", "ReadVIWMulAddV", mx>; defm "" : VPseudoTernaryW_VX, @@ -3436,10 +3444,10 @@ multiclass VPseudoVWMAC_VV_VF_BF_RM { } } -multiclass VPseudoVCMPM_VV_VX_VI { +multiclass VPseudoVCMPM_VV_VX_VI { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryM_VV, + defm "" : VPseudoBinaryM_VV, SchedBinary<"WriteVICmpV", "ReadVICmpV", "ReadVICmpV", mx>; defm "" : VPseudoBinaryM_VX, SchedBinary<"WriteVICmpX", "ReadVICmpV", "ReadVICmpX", mx>; @@ -6248,7 +6256,7 @@ defm PseudoVLSEG : VPseudoUSSegLoadFF; //===----------------------------------------------------------------------===// // 11.1. 
Vector Single-Width Integer Add and Subtract //===----------------------------------------------------------------------===// -defm PseudoVADD : VPseudoVALU_VV_VX_VI; +defm PseudoVADD : VPseudoVALU_VV_VX_VI; defm PseudoVSUB : VPseudoVALU_VV_VX; defm PseudoVRSUB : VPseudoVALU_VX_VI; @@ -6313,9 +6321,9 @@ foreach vti = AllIntegerVectors in { //===----------------------------------------------------------------------===// // 11.2. Vector Widening Integer Add/Subtract //===----------------------------------------------------------------------===// -defm PseudoVWADDU : VPseudoVWALU_VV_VX; +defm PseudoVWADDU : VPseudoVWALU_VV_VX; defm PseudoVWSUBU : VPseudoVWALU_VV_VX; -defm PseudoVWADD : VPseudoVWALU_VV_VX; +defm PseudoVWADD : VPseudoVWALU_VV_VX; defm PseudoVWSUB : VPseudoVWALU_VV_VX; defm PseudoVWADDU : VPseudoVWALU_WV_WX; defm PseudoVWSUBU : VPseudoVWALU_WV_WX; @@ -6346,9 +6354,9 @@ defm PseudoVMSBC : VPseudoVCALUM_V_X<"@earlyclobber $rd">; //===----------------------------------------------------------------------===// // 11.5. Vector Bitwise Logical Instructions //===----------------------------------------------------------------------===// -defm PseudoVAND : VPseudoVALU_VV_VX_VI; -defm PseudoVOR : VPseudoVALU_VV_VX_VI; -defm PseudoVXOR : VPseudoVALU_VV_VX_VI; +defm PseudoVAND : VPseudoVALU_VV_VX_VI; +defm PseudoVOR : VPseudoVALU_VV_VX_VI; +defm PseudoVXOR : VPseudoVALU_VV_VX_VI; //===----------------------------------------------------------------------===// // 11.6. Vector Single-Width Bit Shift Instructions @@ -6366,8 +6374,8 @@ defm PseudoVNSRA : VPseudoVNSHT_WV_WX_WI; //===----------------------------------------------------------------------===// // 11.8. Vector Integer Comparison Instructions //===----------------------------------------------------------------------===// -defm PseudoVMSEQ : VPseudoVCMPM_VV_VX_VI; -defm PseudoVMSNE : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSEQ : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSNE : VPseudoVCMPM_VV_VX_VI; defm PseudoVMSLTU : VPseudoVCMPM_VV_VX; defm PseudoVMSLT : VPseudoVCMPM_VV_VX; defm PseudoVMSLEU : VPseudoVCMPM_VV_VX_VI; @@ -6386,9 +6394,9 @@ defm PseudoVMAX : VPseudoVMINMAX_VV_VX; //===----------------------------------------------------------------------===// // 11.10. Vector Single-Width Integer Multiply Instructions //===----------------------------------------------------------------------===// -defm PseudoVMUL : VPseudoVMUL_VV_VX; -defm PseudoVMULH : VPseudoVMUL_VV_VX; -defm PseudoVMULHU : VPseudoVMUL_VV_VX; +defm PseudoVMUL : VPseudoVMUL_VV_VX; +defm PseudoVMULH : VPseudoVMUL_VV_VX; +defm PseudoVMULHU : VPseudoVMUL_VV_VX; defm PseudoVMULHSU : VPseudoVMUL_VV_VX; //===----------------------------------------------------------------------===// @@ -6402,8 +6410,8 @@ defm PseudoVREM : VPseudoVDIV_VV_VX; //===----------------------------------------------------------------------===// // 11.12. Vector Widening Integer Multiply Instructions //===----------------------------------------------------------------------===// -defm PseudoVWMUL : VPseudoVWMUL_VV_VX; -defm PseudoVWMULU : VPseudoVWMUL_VV_VX; +defm PseudoVWMUL : VPseudoVWMUL_VV_VX; +defm PseudoVWMULU : VPseudoVWMUL_VV_VX; defm PseudoVWMULSU : VPseudoVWMUL_VV_VX; //===----------------------------------------------------------------------===// @@ -6417,8 +6425,8 @@ defm PseudoVNMSUB : VPseudoVMAC_VV_VX_AAXA; //===----------------------------------------------------------------------===// // 11.14. 
Vector Widening Integer Multiply-Add Instructions //===----------------------------------------------------------------------===// -defm PseudoVWMACCU : VPseudoVWMAC_VV_VX; -defm PseudoVWMACC : VPseudoVWMAC_VV_VX; +defm PseudoVWMACCU : VPseudoVWMAC_VV_VX; +defm PseudoVWMACC : VPseudoVWMAC_VV_VX; defm PseudoVWMACCSU : VPseudoVWMAC_VV_VX; defm PseudoVWMACCUS : VPseudoVWMAC_VX; From 5b811562a520a8a3cd164897f24dee7da3115bbe Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 16 Apr 2024 04:33:39 -0400 Subject: [PATCH 048/300] [libc++] Rename __cpu_traits functions (#88741) Functions inside __cpu_traits were needlessly prefixed with __parallel, which doesn't serve a real purpose anymore now that they are inside a traits class. --- .../pstl_backends/cpu_backends/any_of.h | 2 +- .../pstl_backends/cpu_backends/fill.h | 2 +- .../pstl_backends/cpu_backends/find_if.h | 2 +- .../pstl_backends/cpu_backends/for_each.h | 2 +- .../pstl_backends/cpu_backends/libdispatch.h | 24 ++++++++--------- .../pstl_backends/cpu_backends/merge.h | 2 +- .../pstl_backends/cpu_backends/serial.h | 24 ++++++++--------- .../pstl_backends/cpu_backends/stable_sort.h | 2 +- .../pstl_backends/cpu_backends/thread.h | 24 ++++++++--------- .../pstl_backends/cpu_backends/transform.h | 4 +-- .../cpu_backends/transform_reduce.h | 4 +-- libcxx/include/__pstl/cpu_algos/cpu_traits.h | 27 +++++++++---------- 12 files changed, 59 insertions(+), 60 deletions(-) diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h index be5e54f3fa5c85..3755d288047e0b 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h @@ -34,7 +34,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI optional __parallel_or(_Index __first, _Index __last, _Brick __f) { std::atomic __found(false); - auto __ret = __pstl::__cpu_traits<_Backend>::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) { + auto __ret = __pstl::__cpu_traits<_Backend>::__for_each(__first, __last, [__f, &__found](_Index __i, _Index __j) { if (!__found.load(std::memory_order_relaxed) && __f(__i, __j)) { __found.store(true, std::memory_order_relaxed); __pstl::__cpu_traits<_Backend>::__cancel_execution(); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h index 49a32f6c5ce551..0c20bdff62675a 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h @@ -40,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_fill(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + return __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { [[maybe_unused]] auto __res = std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __value); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h index 11a5668bf25af1..626293faef6921 100644 --- 
a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h @@ -42,7 +42,7 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool _DifferenceType __initial_dist = __b_first ? __n : -1; std::atomic<_DifferenceType> __extremum(__initial_dist); // TODO: find out what is better here: parallel_for or parallel_reduce - auto __res = __pstl::__cpu_traits<_Backend>::__parallel_for( + auto __res = __pstl::__cpu_traits<_Backend>::__for_each( __first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) { // See "Reducing Contention Through Priority Updates", PPoPP '13, for discussion of // why using a shared variable scales fairly well in this situation. diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h index 1667ec0f0c4f41..d637084e151d81 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h @@ -40,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_for_each(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + return __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { [[maybe_unused]] auto __res = std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __func); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h index 8757f249680375..17faadf55dd4fa 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h @@ -85,7 +85,7 @@ template <> struct __cpu_traits<__libdispatch_backend_tag> { template _LIBCPP_HIDE_FROM_ABI static optional<__empty> - __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { + __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { return __libdispatch::__dispatch_parallel_for( __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func)); } @@ -105,14 +105,14 @@ struct __cpu_traits<__libdispatch_backend_tag> { typename _RandomAccessIterator3, typename _Compare, typename _LeafMerge> - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __result, - _Compare __comp, - _LeafMerge __leaf_merge) noexcept { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __merge(_RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __result, + _Compare __comp, + _LeafMerge __leaf_merge) noexcept { __libdispatch::__chunk_partitions __partitions = __libdispatch::__partition_chunks(std::max(__last1 - __first1, __last2 - __first2)); @@ -201,7 +201,7 @@ struct __cpu_traits<__libdispatch_backend_tag> { } template - 
_LIBCPP_HIDE_FROM_ABI static optional<_Value> __parallel_transform_reduce( + _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce( _RandomAccessIterator __first, _RandomAccessIterator __last, _Transform __transform, @@ -248,8 +248,8 @@ struct __cpu_traits<__libdispatch_backend_tag> { } template - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) { const auto __size = __last - __first; auto __partitions = __libdispatch::__partition_chunks(__size); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h index d034447904872e..c93f4051c9d094 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h @@ -46,7 +46,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_merge( __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_merge( + auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__merge( __first1, __last1, __first2, diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h index c3d2905daed170..7544619a8eefd8 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h @@ -35,20 +35,20 @@ template <> struct __cpu_traits<__serial_backend_tag> { template _LIBCPP_HIDE_FROM_ABI static optional<__empty> - __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { + __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { __f(__first, __last); return __empty{}; } template _LIBCPP_HIDE_FROM_ABI static optional<_Tp> - __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { + __transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { return __reduce(std::move(__first), std::move(__last), std::move(__init)); } template - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { __leaf_sort(__first, __last, __comp); return __empty{}; } @@ -60,14 +60,14 @@ struct __cpu_traits<__serial_backend_tag> { class _RandomAccessIterator3, class _Compare, class _LeafMerge> - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __outit, - _Compare __comp, - _LeafMerge __leaf_merge) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __merge(_RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 
__first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __outit, + _Compare __comp, + _LeafMerge __leaf_merge) { __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); return __empty{}; } diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h index ebfa0fc69147d5..8c60cf897ff860 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h @@ -29,7 +29,7 @@ template _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_stable_sort(__cpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy>) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_stable_sort( + return __pstl::__cpu_traits<__cpu_backend_tag>::__stable_sort( __first, __last, __comp, [](_RandomAccessIterator __g_first, _RandomAccessIterator __g_last, _Comp __g_comp) { std::stable_sort(__g_first, __g_last, __g_comp); }); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h index 8d1cb221c3d82a..2acf912264a001 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h @@ -38,20 +38,20 @@ template <> struct __cpu_traits<__std_thread_backend_tag> { template _LIBCPP_HIDE_FROM_ABI static optional<__empty> - __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { + __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { __f(__first, __last); return __empty{}; } template _LIBCPP_HIDE_FROM_ABI static optional<_Tp> - __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { + __transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { return __reduce(std::move(__first), std::move(__last), std::move(__init)); } template - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { __leaf_sort(__first, __last, __comp); return __empty{}; } @@ -63,14 +63,14 @@ struct __cpu_traits<__std_thread_backend_tag> { class _RandomAccessIterator3, class _Compare, class _LeafMerge> - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __outit, - _Compare __comp, - _LeafMerge __leaf_merge) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __merge(_RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __outit, + _Compare __comp, + _LeafMerge __leaf_merge) { __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); return __empty{}; } diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h index d4c383997a67a9..4b9b2968668327 100644 --- 
a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h @@ -50,7 +50,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform( if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { auto __res = std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op); @@ -98,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform( __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first1, __last1, [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) { diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h index 956c7d6a88ce29..c074eea9861c1b 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h @@ -120,7 +120,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce( if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce( + return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce( __first1, std::move(__last1), [__first1, __first2, __transform](_ForwardIterator1 __iter) { @@ -167,7 +167,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce( _UnaryOperation __transform) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce( + return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce( std::move(__first), std::move(__last), [__transform](_ForwardIterator __iter) { return __transform(*__iter); }, diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h index 2f0db46e9be83a..0483d6918fd01d 100644 --- a/libcxx/include/__pstl/cpu_algos/cpu_traits.h +++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h @@ -32,31 +32,30 @@ namespace __pstl { // ================ // // template -// optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func); +// optional<__empty> __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func); // - __func must take a subrange of [__first, __last) that should be executed in serial // // template -// optional<_Tp> 
__parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, -// _Reduction); +// optional<_Tp> __transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction); // // template -// optional<_RandomAccessIterator3> __parallel_merge(_RandomAccessIterator1 __first1, -// _RandomAccessIterator1 __last1, -// _RandomAccessIterator2 __first2, -// _RandomAccessIterator2 __last2, -// _RandomAccessIterator3 __outit, -// _Compare __comp, -// _LeafMerge __leaf_merge); +// optional<_RandomAccessIterator3> __merge(_RandomAccessIterator1 __first1, +// _RandomAccessIterator1 __last1, +// _RandomAccessIterator2 __first2, +// _RandomAccessIterator2 __last2, +// _RandomAccessIterator3 __outit, +// _Compare __comp, +// _LeafMerge __leaf_merge); // // template -// optional<__empty> __parallel_stable_sort(_RandomAccessIterator __first, -// _RandomAccessIterator __last, -// _Comp __comp, -// _LeafSort __leaf_sort); +// optional<__empty> __stable_sort(_RandomAccessIterator __first, +// _RandomAccessIterator __last, +// _Comp __comp, +// _LeafSort __leaf_sort); // // void __cancel_execution(); // Cancel the execution of other jobs - they aren't needed anymore. This is not a binding request, From 66cf995da76b9da3cfdee2f29eff6ea4d84305ef Mon Sep 17 00:00:00 2001 From: Carlos Alberto Enciso Date: Tue, 16 Apr 2024 09:37:51 +0100 Subject: [PATCH 049/300] [IPSCCP] Variable not visible at Og: (#77901) https://bugs.llvm.org/show_bug.cgi?id=51559 https://github.com/llvm/llvm-project/issues/50901 IPSCCP pass removes the global variable and does not create a constant expression for the initializer value. Extend test coverage to include: - half, bfloat types. - checks for undef (int32 and ptr). There is no support for: - fp128, x86_fp80, ppc_fp128 types. 
https://github.com/llvm/llvm-project/issues/88102 --- llvm/lib/Transforms/Utils/Local.cpp | 8 ++-- llvm/test/Transforms/SCCP/pr50901.ll | 41 ++++++++++++++++++- llvm/unittests/Transforms/Utils/LocalTest.cpp | 21 ++++++---- 3 files changed, 57 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 380bac9c618077..a42ef0c4e6ae9e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3627,10 +3627,12 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, return createIntegerExpression(C); auto *FP = dyn_cast<ConstantFP>(&C); - if (FP && (Ty.isFloatTy() || Ty.isDoubleTy())) { + if (FP && Ty.isFloatingPointTy() && Ty.getScalarSizeInBits() <= 64) { const APFloat &APF = FP->getValueAPF(); - return DIB.createConstantValueExpression( - APF.bitcastToAPInt().getZExtValue()); + APInt const &API = APF.bitcastToAPInt(); + if (auto Temp = API.getZExtValue()) + return DIB.createConstantValueExpression(static_cast<uint64_t>(Temp)); + return DIB.createConstantValueExpression(*API.getRawData()); } if (!Ty.isPointerTy()) diff --git a/llvm/test/Transforms/SCCP/pr50901.ll b/llvm/test/Transforms/SCCP/pr50901.ll index 11d6bba6f6a935..d48d67532d88bd 100644 --- a/llvm/test/Transforms/SCCP/pr50901.ll +++ b/llvm/test/Transforms/SCCP/pr50901.ll @@ -52,6 +52,16 @@ ; CHECK: = !DIGlobalVariableExpression(var: ![[DBG_FLOAT_UNDEF:.+]], expr: !DIExpression()) ; CHECK-DAG: ![[DBG_FLOAT_UNDEF]] = distinct !DIGlobalVariable(name: "g_float_undef" +; CHECK: ![[G8:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBG8:[0-9]+]], expr: !DIExpression(DW_OP_constu, 22136, DW_OP_stack_value)) +; CHECK-DAG: ![[DBG8]] = distinct !DIGlobalVariable(name: "g_88", {{.*}} +; CHECK: ![[G9:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBG9:[0-9]+]], expr: !DIExpression(DW_OP_constu, 23726, DW_OP_stack_value)) +; CHECK-DAG: ![[DBG9]] = distinct !DIGlobalVariable(name: "g_99", {{.*}} + +; CHECK-DAG: ![[DBGA:[0-9]+]] = distinct !DIGlobalVariable(name: "g_i32_undef" +; CHECK-DAG: ![[GA:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBGA]], expr: !DIExpression()) +; CHECK-DAG: ![[DBGB:[0-9]+]] = distinct !DIGlobalVariable(name: "g_ptr_undef" +; CHECK-DAG: ![[GB:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBGB]], expr: !DIExpression()) + @g_1 = dso_local global i32 -4, align 4, !dbg !0 @g_2 = dso_local global float 0x4011C28F60000000, align 4, !dbg !8 @g_3 = dso_local global i8 97, align 1, !dbg !10 @g_4 = dso_local global i32 -3, align 4, !dbg !13 @g_5 = dso_local global i8 1, align 1, !dbg !16 @g_6 = dso_local global ptr null, align 8, !dbg !19 @g_7 = dso_local global ptr null, align 8, !dbg !23 +@g_8 = dso_local global half 0xH4321, align 4, !dbg !86 +@g_9 = dso_local global bfloat 0xR3F80, align 4, !dbg !90 @_ZL4g_11 = internal global i32 -5, align 4, !dbg !25 @_ZL4g_22 = internal global float 0x4016333340000000, align 4, !dbg !27 @_ZL4g_33 = internal global i8 98, align 1, !dbg !29 @_ZL4g_44 = internal global i32 -6, align 4, !dbg !31 @_ZL4g_55 = internal global i8 2, align 1, !dbg !33 @_ZL4g_66 = internal global ptr null, align 8, !dbg !35 @_ZL4g_77 = internal global ptr inttoptr (i64 70 to ptr), align 8, !dbg !37 @g_float_undef = internal global float undef, align 4, !dbg !83 +@_ZL4g_88 = internal global half 0xH5678, align 4, !dbg !88 +@_ZL4g_99 = internal global bfloat 0xR5CAE, align 4, !dbg !92 +@g_i32_undef = internal global i32 undef, align 4, !dbg !95 +@g_ptr_undef = internal global ptr undef, align 8, !dbg !97 define dso_local void @_Z3barv() !dbg !46 { entry: @@ -88,6 +104,15 @@ entry: store ptr %6, ptr @g_7, align 8, !dbg !59 %l = load
float, ptr @g_float_undef, align 8, !dbg !59 store float %l, ptr @g_2, align 8, !dbg !59 + %7 = load half, ptr @_ZL4g_88, align 4, !dbg !59 + store half %7, ptr @g_8, align 4, !dbg !59 + %8 = load bfloat, ptr @_ZL4g_99, align 4, !dbg !59 + store bfloat %8, ptr @g_9, align 4, !dbg !59 + %9 = load i32, ptr @g_i32_undef, align 4, !dbg !59 + store i32 %9, ptr @g_1, align 4, !dbg !59 + %10 = load ptr, ptr @g_ptr_undef, align 8, !dbg !59 + store ptr %10, ptr @g_6, align 8, !dbg !59 + ret void, !dbg !59 } @@ -108,7 +133,7 @@ entry: !4 = !{!5} !5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 64) !6 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) -!7 = !{!0, !8, !10, !13, !16, !19, !23, !25, !27, !29, !31, !33, !35, !37, !83} +!7 = !{!0, !8, !10, !13, !16, !19, !23, !25, !27, !29, !31, !33, !35, !37, !83, !86, !88, !90, !92, !95, !97} !8 = !DIGlobalVariableExpression(var: !9, expr: !DIExpression()) !9 = distinct !DIGlobalVariable(name: "g_2", scope: !2, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true) !10 = !DIGlobalVariableExpression(var: !11, expr: !DIExpression()) @@ -159,3 +184,17 @@ entry: !82 = !DILocation(line: 31, column: 1, scope: !77) !83 = !DIGlobalVariableExpression(var: !84, expr: !DIExpression()) !84 = distinct !DIGlobalVariable(name: "g_float_undef", linkageName: "g_float_undef", scope: !2, file: !3, line: 15, type: !6, isLocal: true, isDefinition: true) +!85 = !DIBasicType(name: "float", size: 16, encoding: DW_ATE_float) +!86 = !DIGlobalVariableExpression(var: !87, expr: !DIExpression()) +!87 = distinct !DIGlobalVariable(name: "g_8", scope: !2, file: !3, line: 2, type: !85, isLocal: false, isDefinition: true) +!88 = !DIGlobalVariableExpression(var: !89, expr: !DIExpression()) +!89 = distinct !DIGlobalVariable(name: "g_88", linkageName: "_ZL4g_88", scope: !2, file: !3, line: 10, type: !85, isLocal: true, isDefinition: true) +!90 = !DIGlobalVariableExpression(var: !91, expr: !DIExpression()) +!91 = distinct !DIGlobalVariable(name: "g_9", scope: !2, file: !3, line: 2, type: !85, isLocal: false, isDefinition: true) +!92 = !DIGlobalVariableExpression(var: !93, expr: !DIExpression()) +!93 = distinct !DIGlobalVariable(name: "g_99", linkageName: "_ZL4g_99", scope: !2, file: !3, line: 10, type: !85, isLocal: true, isDefinition: true) + +!95 = !DIGlobalVariableExpression(var: !96, expr: !DIExpression()) +!96 = distinct !DIGlobalVariable(name: "g_i32_undef", linkageName: "g_i32_undef", scope: !2, file: !3, line: 9, type: !22, isLocal: true, isDefinition: true) +!97 = !DIGlobalVariableExpression(var: !98, expr: !DIExpression()) +!98 = distinct !DIGlobalVariable(name: "g_ptr_undef", linkageName: "g_ptr_undef", scope: !2, file: !3, line: 14, type: !21, isLocal: true, isDefinition: true) diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index a86775a1366b05..d7d0ea2c6a6e79 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -1241,6 +1241,18 @@ TEST(Local, ExpressionForConstant) { EXPECT_NE(Expr, nullptr); EXPECT_EQ(Expr->getElement(1), 13841306799765140275U); + // Half. + Type *HalfTy = Type::getHalfTy(Context); + Expr = createExpression(ConstantFP::get(HalfTy, 5.55), HalfTy); + EXPECT_NE(Expr, nullptr); + EXPECT_EQ(Expr->getElement(1), 17805U); + + // BFloat. 
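// Note on the expected constants in the unit tests around here: they are the
// raw IEEE-754 bit patterns of the converted literals (assuming the default
// round-to-nearest-even conversion): half 5.55 encodes as 0x458D == 17805,
// and bfloat -5.55 encodes as 0xC0B2 == 49330.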
+ Type *BFloatTy = Type::getBFloatTy(Context); + Expr = createExpression(ConstantFP::get(BFloatTy, -5.55), BFloatTy); + EXPECT_NE(Expr, nullptr); + EXPECT_EQ(Expr->getElement(1), 49330U); + // Pointer. PointerType *PtrTy = PointerType::get(Context, 0); Expr = createExpression(ConstantPointerNull::get(PtrTy), PtrTy); @@ -1257,15 +1269,6 @@ TEST(Local, ExpressionForConstant) { EXPECT_NE(Expr, nullptr); EXPECT_EQ(Expr->getElement(1), 5678U); - // Others. - Type *HalfTy = Type::getHalfTy(Context); - Expr = createExpression(ConstantFP::get(HalfTy, 32), HalfTy); - EXPECT_EQ(Expr, nullptr); - - Type *BFloatTy = Type::getBFloatTy(Context); - Expr = createExpression(ConstantFP::get(BFloatTy, 32), BFloatTy); - EXPECT_EQ(Expr, nullptr); - Type *FP128Ty = Type::getFP128Ty(Context); Expr = createExpression(ConstantFP::get(FP128Ty, 32), FP128Ty); EXPECT_EQ(Expr, nullptr); From d6d84b5d1448e4f2e24b467a0abcf42fe9d543e9 Mon Sep 17 00:00:00 2001 From: NagyDonat Date: Tue, 16 Apr 2024 10:41:26 +0200 Subject: [PATCH 050/300] [analyzer] Handle builtin functions in MallocChecker (#88416) This commit ensures that the `CallDescription`s in `MallocChecker` are matched with the mode `CDM::CLibrary`, so: - they don't match methods or functions within user-defined namespaces; - they also match builtin variants of these functions (if any), so the checker can model `__builtin_alloca()` like `alloca()`. This change fixes https://github.com/llvm/llvm-project/issues/81597. New tests were added to verify that `std::malloc` and `std::free` (from `<cstdlib>`) are modeled, but a method that's named e.g. `free` isn't confused with the memory release function. The responsibility for modeling `__builtin_alloca` and `__builtin_alloca_with_align` was moved from `BuiltinFunctionChecker` to `MallocChecker`, to avoid buggy interactions between the checkers and ensure that the builtin and non-builtin variants are handled by exactly the same logic. This change might be a step backwards for users who don't have `unix.Malloc` enabled, but I suspect that `__builtin_alloca()` is so rare that it would be a waste of time to implement backwards compatibility for them. There were several test files that relied on `__builtin_alloca()` calls to get an `AllocaRegion`; these were modified to enable `unix.Malloc`. One of these files (cxx-uninitialized-object-ptr-ref.cpp) had some tests that relied on the fact that `malloc()` was treated as a "black box" in them; these were updated to use `calloc()` (to get initialized memory) and `free()` (to avoid memory leak reports). While I was developing this change, I found a very suspicious assert in `MallocChecker`. As it isn't blocking the goals of this commit, I just marked it with a FIXME, but I'll try to investigate and fix it in a follow-up change.
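To make the new matching mode concrete, here is a minimal sketch of how a CDM::CLibrary description behaves (an illustrative fragment, not code from this patch; it assumes the static analyzer headers, and CDM abbreviates CallDescription::Mode as in the checker source):

    #include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
    #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
    using namespace clang;
    using namespace ento;
    using CDM = CallDescription::Mode;

    // Matches ::free (and, per CDM::CLibrary, its __builtin_free variant),
    // but not a method named free or a free() in a user-defined namespace.
    const CallDescription FreeCall(CDM::CLibrary, {"free"}, /*RequiredArgs=*/1);

    bool isFreeCall(const CallEvent &Call) { return FreeCall.matches(Call); }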
--- .../Checkers/BuiltinFunctionChecker.cpp | 25 ++---- .../StaticAnalyzer/Checkers/MallocChecker.cpp | 89 ++++++++++--------- .../Inputs/system-header-simulator-cxx.h | 6 +- .../cxx-uninitialized-object-ptr-ref.cpp | 24 +++-- clang/test/Analysis/exercise-ps.c | 2 +- clang/test/Analysis/explain-svals.cpp | 2 +- clang/test/Analysis/malloc-std-namespace.cpp | 24 +++++ clang/test/Analysis/malloc.c | 11 +++ clang/test/Analysis/malloc.cpp | 11 +++ clang/test/Analysis/stack-addr-ps.c | 2 +- clang/test/Analysis/stackaddrleak.c | 4 +- 11 files changed, 125 insertions(+), 75 deletions(-) create mode 100644 clang/test/Analysis/malloc-std-namespace.cpp diff --git a/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp index 01e46fa8591c07..1a75d7b52ad6e9 100644 --- a/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp @@ -6,7 +6,11 @@ // //===----------------------------------------------------------------------===// // -// This checker evaluates clang builtin functions. +// This checker evaluates "standalone" clang builtin functions that are not +// just special-cased variants of well-known non-builtin functions. +// Builtin functions like __builtin_memcpy and __builtin_alloca should be +// evaluated by the same checker that handles their non-builtin variant to +// ensure that the two variants are handled consistently. // //===----------------------------------------------------------------------===// @@ -80,25 +84,6 @@ bool BuiltinFunctionChecker::evalCall(const CallEvent &Call, return true; } - case Builtin::BI__builtin_alloca_with_align: - case Builtin::BI__builtin_alloca: { - SValBuilder &SVB = C.getSValBuilder(); - const loc::MemRegionVal R = - SVB.getAllocaRegionVal(CE, C.getLocationContext(), C.blockCount()); - - // Set the extent of the region in bytes. This enables us to use the SVal - // of the argument directly. If we saved the extent in bits, it'd be more - // difficult to reason about values like symbol*8. - auto Size = Call.getArgSVal(0); - if (auto DefSize = Size.getAs()) { - // This `getAs()` is mostly paranoia, because core.CallAndMessage reports - // undefined function arguments (unless it's disabled somehow). 
- state = setDynamicExtent(state, R.getRegion(), *DefSize, SVB); - } - C.addTransition(state->BindExpr(CE, LCtx, R)); - return true; - } - case Builtin::BI__builtin_dynamic_object_size: case Builtin::BI__builtin_object_size: case Builtin::BI__builtin_constant_p: { diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index 88fb42b6625aa4..11651fd491f743 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -401,10 +401,11 @@ class MallocChecker }; const CallDescriptionMap FreeingMemFnMap{ - {{{"free"}, 1}, &MallocChecker::checkFree}, - {{{"if_freenameindex"}, 1}, &MallocChecker::checkIfFreeNameIndex}, - {{{"kfree"}, 1}, &MallocChecker::checkFree}, - {{{"g_free"}, 1}, &MallocChecker::checkFree}, + {{CDM::CLibrary, {"free"}, 1}, &MallocChecker::checkFree}, + {{CDM::CLibrary, {"if_freenameindex"}, 1}, + &MallocChecker::checkIfFreeNameIndex}, + {{CDM::CLibrary, {"kfree"}, 1}, &MallocChecker::checkFree}, + {{CDM::CLibrary, {"g_free"}, 1}, &MallocChecker::checkFree}, }; bool isFreeingCall(const CallEvent &Call) const; @@ -413,41 +414,46 @@ class MallocChecker friend class NoOwnershipChangeVisitor; CallDescriptionMap AllocatingMemFnMap{ - {{{"alloca"}, 1}, &MallocChecker::checkAlloca}, - {{{"_alloca"}, 1}, &MallocChecker::checkAlloca}, - {{{"malloc"}, 1}, &MallocChecker::checkBasicAlloc}, - {{{"malloc"}, 3}, &MallocChecker::checkKernelMalloc}, - {{{"calloc"}, 2}, &MallocChecker::checkCalloc}, - {{{"valloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"alloca"}, 1}, &MallocChecker::checkAlloca}, + {{CDM::CLibrary, {"_alloca"}, 1}, &MallocChecker::checkAlloca}, + // The line for "alloca" also covers "__builtin_alloca", but the + // _with_align variant must be listed separately because it takes an + // extra argument: + {{CDM::CLibrary, {"__builtin_alloca_with_align"}, 2}, + &MallocChecker::checkAlloca}, + {{CDM::CLibrary, {"malloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"malloc"}, 3}, &MallocChecker::checkKernelMalloc}, + {{CDM::CLibrary, {"calloc"}, 2}, &MallocChecker::checkCalloc}, + {{CDM::CLibrary, {"valloc"}, 1}, &MallocChecker::checkBasicAlloc}, {{CDM::CLibrary, {"strndup"}, 2}, &MallocChecker::checkStrdup}, {{CDM::CLibrary, {"strdup"}, 1}, &MallocChecker::checkStrdup}, - {{{"_strdup"}, 1}, &MallocChecker::checkStrdup}, - {{{"kmalloc"}, 2}, &MallocChecker::checkKernelMalloc}, - {{{"if_nameindex"}, 1}, &MallocChecker::checkIfNameIndex}, + {{CDM::CLibrary, {"_strdup"}, 1}, &MallocChecker::checkStrdup}, + {{CDM::CLibrary, {"kmalloc"}, 2}, &MallocChecker::checkKernelMalloc}, + {{CDM::CLibrary, {"if_nameindex"}, 1}, &MallocChecker::checkIfNameIndex}, {{CDM::CLibrary, {"wcsdup"}, 1}, &MallocChecker::checkStrdup}, {{CDM::CLibrary, {"_wcsdup"}, 1}, &MallocChecker::checkStrdup}, - {{{"g_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, - {{{"g_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, - {{{"g_try_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, - {{{"g_try_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, - {{{"g_memdup"}, 2}, &MallocChecker::checkGMemdup}, - {{{"g_malloc_n"}, 2}, &MallocChecker::checkGMallocN}, - {{{"g_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, - {{{"g_try_malloc_n"}, 2}, &MallocChecker::checkGMallocN}, - {{{"g_try_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, + {{CDM::CLibrary, {"g_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"g_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, + 
{{CDM::CLibrary, {"g_try_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"g_try_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, + {{CDM::CLibrary, {"g_memdup"}, 2}, &MallocChecker::checkGMemdup}, + {{CDM::CLibrary, {"g_malloc_n"}, 2}, &MallocChecker::checkGMallocN}, + {{CDM::CLibrary, {"g_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, + {{CDM::CLibrary, {"g_try_malloc_n"}, 2}, &MallocChecker::checkGMallocN}, + {{CDM::CLibrary, {"g_try_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, }; CallDescriptionMap ReallocatingMemFnMap{ - {{{"realloc"}, 2}, + {{CDM::CLibrary, {"realloc"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, false)}, - {{{"reallocf"}, 2}, + {{CDM::CLibrary, {"reallocf"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, true)}, - {{{"g_realloc"}, 2}, + {{CDM::CLibrary, {"g_realloc"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, false)}, - {{{"g_try_realloc"}, 2}, + {{CDM::CLibrary, {"g_try_realloc"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, false)}, - {{{"g_realloc_n"}, 3}, &MallocChecker::checkReallocN}, - {{{"g_try_realloc_n"}, 3}, &MallocChecker::checkReallocN}, + {{CDM::CLibrary, {"g_realloc_n"}, 3}, &MallocChecker::checkReallocN}, + {{CDM::CLibrary, {"g_try_realloc_n"}, 3}, &MallocChecker::checkReallocN}, // NOTE: the following CallDescription also matches the C++ standard // library function std::getline(); the callback will filter it out. @@ -1259,9 +1265,6 @@ static bool isStandardRealloc(const CallEvent &Call) { assert(FD); ASTContext &AC = FD->getASTContext(); - if (isa(FD)) - return false; - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(1)->getType().getDesugaredType(AC) == @@ -1273,9 +1276,6 @@ static bool isGRealloc(const CallEvent &Call) { assert(FD); ASTContext &AC = FD->getASTContext(); - if (isa(FD)) - return false; - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(1)->getType().getDesugaredType(AC) == @@ -1284,14 +1284,14 @@ static bool isGRealloc(const CallEvent &Call) { void MallocChecker::checkRealloc(const CallEvent &Call, CheckerContext &C, bool ShouldFreeOnFail) const { - // HACK: CallDescription currently recognizes non-standard realloc functions - // as standard because it doesn't check the type, or wether its a non-method - // function. This should be solved by making CallDescription smarter. - // Mind that this came from a bug report, and all other functions suffer from - // this. - // https://bugs.llvm.org/show_bug.cgi?id=46253 + // Ignore calls to functions whose type does not match the expected type of + // either the standard realloc or g_realloc from GLib. + // FIXME: Should we perform this kind of checking consistently for each + // function? If yes, then perhaps extend the `CallDescription` interface to + // handle this. if (!isStandardRealloc(Call) && !isGRealloc(Call)) return; + ProgramStateRef State = C.getState(); State = ReallocMemAux(C, Call, ShouldFreeOnFail, State, AF_Malloc); State = ProcessZeroAllocCheck(Call, 1, State); @@ -1842,9 +1842,18 @@ static ProgramStateRef MallocUpdateRefState(CheckerContext &C, const Expr *E, return nullptr; SymbolRef Sym = RetVal->getAsLocSymbol(); + // This is a return value of a function that was not inlined, such as malloc() // or new(). We've checked that in the caller. Therefore, it must be a symbol. 
assert(Sym); + // FIXME: In theory this assertion should fail for `alloca()` calls (because + // `AllocaRegion`s are not symbolic); but in practice this does not happen. + // As the current code appears to work correctly, I'm not touching this issue + // now, but it would be good to investigate and clarify this. + // Also note that perhaps the special `AllocaRegion` should be replaced by + // `SymbolicRegion` (or turned into a subclass of `SymbolicRegion`) to enable + // proper tracking of memory allocated by `alloca()` -- and after that change + // this assertion would become valid again. // Set the symbol's state to Allocated. return State->set(Sym, RefState::getAllocated(Family, E)); diff --git a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h index 85db68d41a6c80..1c2be322f83c20 100644 --- a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h +++ b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h @@ -1106,6 +1106,7 @@ using ostream = basic_ostream; extern std::ostream cout; ostream &operator<<(ostream &, const string &); + #if __cplusplus >= 202002L template ostream &operator<<(ostream &, const std::unique_ptr &); @@ -1122,11 +1123,12 @@ istream &getline(istream &, string &, char); istream &getline(istream &, string &); } // namespace std -#ifdef TEST_INLINABLE_ALLOCATORS namespace std { void *malloc(size_t); void free(void *); -} +} // namespace std + +#ifdef TEST_INLINABLE_ALLOCATORS void* operator new(std::size_t size, const std::nothrow_t&) throw() { return std::malloc(size); } void* operator new[](std::size_t size, const std::nothrow_t&) throw() { return std::malloc(size); } void operator delete(void* ptr, const std::nothrow_t&) throw() { std::free(ptr); } diff --git a/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp b/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp index fc067dd04428a8..f46a2c9bc368f6 100644 --- a/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp +++ b/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp @@ -1,9 +1,9 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,optin.cplusplus.UninitializedObject \ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,optin.cplusplus.UninitializedObject \ // RUN: -analyzer-config optin.cplusplus.UninitializedObject:Pedantic=true -DPEDANTIC \ // RUN: -analyzer-config optin.cplusplus.UninitializedObject:CheckPointeeInitialization=true \ // RUN: -std=c++11 -verify %s -// RUN: %clang_analyze_cc1 -analyzer-checker=core,optin.cplusplus.UninitializedObject \ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,optin.cplusplus.UninitializedObject \ // RUN: -analyzer-config optin.cplusplus.UninitializedObject:CheckPointeeInitialization=true \ // RUN: -std=c++11 -verify %s @@ -316,7 +316,10 @@ void fCyclicPointerTest2() { // Void pointer tests are mainly no-crash tests. 
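// Note (editorial assumption, not part of the original patch): the switch
// below from the bogus 'void *malloc(int size)' prototype to calloc()+free()
// is presumably needed because unix.Malloc is now enabled in the RUN lines
// above and would otherwise flag these never-freed allocations as leaks.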
-void *malloc(int size);
+typedef __typeof(sizeof(int)) size_t;
+
+void *calloc(size_t nmemb, size_t size);
+void free(void *p);
 
 class VoidPointerTest1 {
   void *vptr;
@@ -328,8 +331,9 @@ class VoidPointerTest1 {
 };
 
 void fVoidPointerTest1() {
-  void *vptr = malloc(sizeof(int));
+  void *vptr = calloc(1, sizeof(int));
   VoidPointerTest1(vptr, char());
+  free(vptr);
 }
 
 class VoidPointerTest2 {
@@ -342,8 +346,9 @@ class VoidPointerTest2 {
 };
 
 void fVoidPointerTest2() {
-  void *vptr = malloc(sizeof(int));
+  void *vptr = calloc(1, sizeof(int));
   VoidPointerTest2(&vptr, char());
+  free(vptr);
 }
 
 class VoidPointerRRefTest1 {
@@ -359,8 +364,9 @@ upon returning to the caller. This will be a dangling reference}}
 };
 
 void fVoidPointerRRefTest1() {
-  void *vptr = malloc(sizeof(int));
+  void *vptr = calloc(1, sizeof(int));
   VoidPointerRRefTest1(vptr, char());
+  free(vptr);
 }
 
 class VoidPointerRRefTest2 {
@@ -376,8 +382,9 @@ upon returning to the caller. This will be a dangling reference}}
 };
 
 void fVoidPointerRRefTest2() {
-  void *vptr = malloc(sizeof(int));
+  void *vptr = calloc(1, sizeof(int));
   VoidPointerRRefTest2(&vptr, char());
+  free(vptr);
 }
 
 class VoidPointerLRefTest {
@@ -393,8 +400,9 @@ upon returning to the caller. This will be a dangling reference}}
 };
 
 void fVoidPointerLRefTest() {
-  void *vptr = malloc(sizeof(int));
+  void *vptr = calloc(1, sizeof(int));
   VoidPointerLRefTest(vptr, char());
+  free(vptr);
 }
 
 struct CyclicVoidPointerTest {
diff --git a/clang/test/Analysis/exercise-ps.c b/clang/test/Analysis/exercise-ps.c
index d214c3959b2078..d1e1771afddb5e 100644
--- a/clang/test/Analysis/exercise-ps.c
+++ b/clang/test/Analysis/exercise-ps.c
@@ -1,5 +1,5 @@
 // RUN: %clang_analyze_cc1 %s -verify -Wno-error=implicit-function-declaration \
-// RUN:   -analyzer-checker=core \
+// RUN:   -analyzer-checker=core,unix.Malloc \
 // RUN:   -analyzer-config core.CallAndMessage:ArgPointeeInitializedness=true
 //
 // Just exercise the analyzer on code that has at one point caused issues
diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp
index 30368b6976cc23..33fce10c4e2b2c 100644
--- a/clang/test/Analysis/explain-svals.cpp
+++ b/clang/test/Analysis/explain-svals.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_analyze_cc1 -triple i386-apple-darwin10 -verify %s \
-// RUN:   -analyzer-checker=core.builtin \
 // RUN:   -analyzer-checker=debug.ExprInspection \
 // RUN:   -analyzer-checker=unix.cstring \
+// RUN:   -analyzer-checker=unix.Malloc \
 // RUN:   -analyzer-config display-checker-name=false
 
 typedef unsigned long size_t;
diff --git a/clang/test/Analysis/malloc-std-namespace.cpp b/clang/test/Analysis/malloc-std-namespace.cpp
new file mode 100644
index 00000000000000..d4e397bb812aa9
--- /dev/null
+++ b/clang/test/Analysis/malloc-std-namespace.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -analyzer-output=text %s
+
+// This file tests that unix.Malloc can handle C++ code where e.g. malloc and
+// free are declared within the namespace 'std' by the header <cstdlib>.
+ +#include "Inputs/system-header-simulator-cxx.h" + +void leak() { + int *p = static_cast(std::malloc(sizeof(int))); // expected-note{{Memory is allocated}} +} // expected-warning{{Potential leak of memory pointed to by 'p'}} + // expected-note@-1{{Potential leak of memory pointed to by 'p'}} + +void no_leak() { + int *p = static_cast(std::malloc(sizeof(int))); + std::free(p); // no-warning +} + +void invalid_free() { + int i; + int *p = &i; + //expected-note@+2{{Argument to free() is the address of the local variable 'i', which is not memory allocated by malloc()}} + //expected-warning@+1{{Argument to free() is the address of the local variable 'i', which is not memory allocated by malloc()}} + std::free(p); +} diff --git a/clang/test/Analysis/malloc.c b/clang/test/Analysis/malloc.c index 09cd4b0bfce638..e5cb45ba733524 100644 --- a/clang/test/Analysis/malloc.c +++ b/clang/test/Analysis/malloc.c @@ -740,6 +740,17 @@ void allocaFree(void) { free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}} } +void allocaFreeBuiltin(void) { + int *p = __builtin_alloca(sizeof(int)); + free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}} +} + +void allocaFreeBuiltinAlign(void) { + int *p = __builtin_alloca_with_align(sizeof(int), 64); + free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}} +} + + int* mallocEscapeRet(void) { int *p = malloc(12); return p; // no warning diff --git a/clang/test/Analysis/malloc.cpp b/clang/test/Analysis/malloc.cpp index 14b4c0576384f2..300b344ab25d69 100644 --- a/clang/test/Analysis/malloc.cpp +++ b/clang/test/Analysis/malloc.cpp @@ -214,3 +214,14 @@ void *realloc(void **ptr, size_t size) { realloc(ptr, size); } // no-crash namespace pr46253_paramty2{ void *realloc(void *ptr, int size) { realloc(ptr, size); } // no-crash } // namespace pr46253_paramty2 + +namespace pr81597 { +struct S {}; +struct T { + void free(const S& s); +}; +void f(T& t) { + S s; + t.free(s); // no-warning: This is not the free you are looking for... +} +} // namespace pr81597 diff --git a/clang/test/Analysis/stack-addr-ps.c b/clang/test/Analysis/stack-addr-ps.c index e469396e1bb22a..e69ab4189b524f 100644 --- a/clang/test/Analysis/stack-addr-ps.c +++ b/clang/test/Analysis/stack-addr-ps.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core -fblocks -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -fblocks -verify %s int* f1(void) { int x = 0; diff --git a/clang/test/Analysis/stackaddrleak.c b/clang/test/Analysis/stackaddrleak.c index 0583bfc18711c5..39c29f2a2635b5 100644 --- a/clang/test/Analysis/stackaddrleak.c +++ b/clang/test/Analysis/stackaddrleak.c @@ -1,5 +1,5 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify -std=c99 -Dbool=_Bool -Wno-bool-conversion %s -// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify -x c++ -Wno-bool-conversion %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -std=c99 -Dbool=_Bool -Wno-bool-conversion %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -x c++ -Wno-bool-conversion %s typedef __INTPTR_TYPE__ intptr_t; char const *p; From 40dd3aa91d3f73184e34e45e597b84bec059c572 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Tue, 16 Apr 2024 10:59:02 +0200 Subject: [PATCH 051/300] [mlir][Interfaces] `Variable` abstraction for `ValueBoundsOpInterface` (#87980) This commit generalizes and cleans up the `ValueBoundsConstraintSet` API. 
The API used to provide function overloads for comparing/computing bounds of: - index-typed SSA value - dimension of shaped value - affine map + operands This commit removes all overloads. There is now a single entry point for each `compare` variant and each `computeBound` variant. These functions now take a `Variable`, which is internally represented as an affine map and map operands. This commit also adds support for computing bounds for an affine map + operands. There was previously no public API for that. --- .../Dialect/Affine/Transforms/Transforms.h | 11 + .../Dialect/Arith/Transforms/Transforms.h | 11 + .../mlir/Interfaces/ValueBoundsOpInterface.h | 119 +++--- .../Affine/IR/ValueBoundsOpInterfaceImpl.cpp | 6 +- .../Affine/Transforms/ReifyValueBounds.cpp | 15 +- .../Arith/IR/ValueBoundsOpInterfaceImpl.cpp | 8 +- .../Dialect/Arith/Transforms/IntNarrowing.cpp | 2 +- .../Arith/Transforms/ReifyValueBounds.cpp | 15 +- .../lib/Dialect/Linalg/Transforms/Padding.cpp | 6 +- .../Dialect/Linalg/Transforms/Promotion.cpp | 6 +- .../Transforms/IndependenceTransforms.cpp | 5 +- .../SCF/IR/ValueBoundsOpInterfaceImpl.cpp | 17 +- .../Tensor/IR/TensorTilingInterfaceImpl.cpp | 3 +- .../Transforms/IndependenceTransforms.cpp | 3 +- mlir/lib/Dialect/Tensor/Utils/Utils.cpp | 4 +- .../lib/Interfaces/ValueBoundsOpInterface.cpp | 338 ++++++++---------- .../value-bounds-op-interface-impl.mlir | 24 ++ .../Dialect/Affine/TestReifyValueBounds.cpp | 26 +- mlir/test/lib/Dialect/Test/TestDialect.cpp | 37 ++ mlir/test/lib/Dialect/Test/TestOps.td | 16 +- 20 files changed, 361 insertions(+), 311 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h index 8e840e744064d5..1ea73752208156 100644 --- a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h @@ -53,6 +53,17 @@ void reorderOperandsByHoistability(RewriterBase &rewriter, AffineApplyOp op); /// maximally compose chains of AffineApplyOps. FailureOr decompose(RewriterBase &rewriter, AffineApplyOp op); +/// Reify a bound for the given variable in terms of SSA values for which +/// `stopCondition` is met. +/// +/// By default, lower/equal bounds are closed and upper bounds are open. If +/// `closedUB` is set to "true", upper bounds are also closed. +FailureOr +reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, + const ValueBoundsConstraintSet::Variable &var, + ValueBoundsConstraintSet::StopConditionFn stopCondition, + bool closedUB = false); + /// Reify a bound for the given index-typed value in terms of SSA values for /// which `stopCondition` is met. If no stop condition is specified, reify in /// terms of the operands of the owner op. diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h index 970a52a06a11a2..bbc7e5d3e0dd70 100644 --- a/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h @@ -24,6 +24,17 @@ enum class BoundType; namespace arith { +/// Reify a bound for the given variable in terms of SSA values for which +/// `stopCondition` is met. +/// +/// By default, lower/equal bounds are closed and upper bounds are open. If +/// `closedUB` is set to "true", upper bounds are also closed. 
+FailureOr<OpFoldResult>
+reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type,
+                const ValueBoundsConstraintSet::Variable &var,
+                ValueBoundsConstraintSet::StopConditionFn stopCondition,
+                bool closedUB = false);
+
 /// Reify a bound for the given index-typed value in terms of SSA values for
 /// which `stopCondition` is met. If no stop condition is specified, reify in
 /// terms of the operands of the owner op.
diff --git a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
index 1d7bc6ea961cc3..ac17ace5a976d2 100644
--- a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
+++ b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
@@ -15,6 +15,7 @@
 #include "mlir/IR/Value.h"
 #include "mlir/Interfaces/DestinationStyleOpInterface.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/ExtensibleRTTI.h"
 
 #include <queue>
@@ -111,6 +112,39 @@ class ValueBoundsConstraintSet
 public:
   static char ID;
 
+  /// A variable that can be added to the constraint set as a "column". The
+  /// value bounds infrastructure can compute bounds for variables and compare
+  /// two variables.
+  ///
+  /// Internally, a variable is represented as an affine map and operands.
+  class Variable {
+  public:
+    /// Construct a variable for an index-typed attribute or SSA value.
+    Variable(OpFoldResult ofr);
+
+    /// Construct a variable for an index-typed SSA value.
+    Variable(Value indexValue);
+
+    /// Construct a variable for a dimension of a shaped value.
+    Variable(Value shapedValue, int64_t dim);
+
+    /// Construct a variable for an index-typed attribute/SSA value or for a
+    /// dimension of a shaped value. A non-null dimension must be provided if
+    /// and only if `ofr` is a shaped value.
+    Variable(OpFoldResult ofr, std::optional<int64_t> dim);
+
+    /// Construct a variable for a map and its operands.
+    Variable(AffineMap map, ArrayRef<Variable> mapOperands);
+    Variable(AffineMap map, ArrayRef<Value> mapOperands);
+
+    MLIRContext *getContext() const { return map.getContext(); }
+
+  private:
+    friend class ValueBoundsConstraintSet;
+    AffineMap map;
+    ValueDimList mapOperands;
+  };
+
   /// The stop condition when traversing the backward slice of a shaped value/
   /// index-type value. The traversal continues until the stop condition
   /// evaluates to "true" for a value.
@@ -121,35 +155,31 @@ class ValueBoundsConstraintSet
   using StopConditionFn = std::function<bool(
       Value, std::optional<int64_t> /*dim*/, ValueBoundsConstraintSet &cstr)>;
 
-  /// Compute a bound for the given index-typed value or shape dimension size.
-  /// The computed bound is stored in `resultMap`. The operands of the bound are
-  /// stored in `mapOperands`. An operand is either an index-type SSA value
-  /// or a shaped value and a dimension.
+  /// Compute a bound for the given variable. The computed bound is stored in
+  /// `resultMap`. The operands of the bound are stored in `mapOperands`. An
+  /// operand is either an index-type SSA value or a shaped value and a
+  /// dimension.
   ///
-  /// `dim` must be `nullopt` if and only if `value` is index-typed. The bound
-  /// is computed in terms of values/dimensions for which `stopCondition`
-  /// evaluates to "true". To that end, the backward slice (reverse use-def
-  /// chain) of the given value is visited in a worklist-driven manner and the
-  /// constraint set is populated according to `ValueBoundsOpInterface` for each
-  /// visited value.
+  /// The bound is computed in terms of values/dimensions for which
+  /// `stopCondition` evaluates to "true".
To that end, the backward slice + /// (reverse use-def chain) of the given value is visited in a worklist-driven + /// manner and the constraint set is populated according to + /// `ValueBoundsOpInterface` for each visited value. /// /// By default, lower/equal bounds are closed and upper bounds are open. If /// `closedUB` is set to "true", upper bounds are also closed. - static LogicalResult computeBound(AffineMap &resultMap, - ValueDimList &mapOperands, - presburger::BoundType type, Value value, - std::optional dim, - StopConditionFn stopCondition, - bool closedUB = false); + static LogicalResult + computeBound(AffineMap &resultMap, ValueDimList &mapOperands, + presburger::BoundType type, const Variable &var, + StopConditionFn stopCondition, bool closedUB = false); /// Compute a bound in terms of the values/dimensions in `dependencies`. The /// computed bound consists of only constant terms and dependent values (or /// dimension sizes thereof). static LogicalResult computeDependentBound(AffineMap &resultMap, ValueDimList &mapOperands, - presburger::BoundType type, Value value, - std::optional dim, ValueDimList dependencies, - bool closedUB = false); + presburger::BoundType type, const Variable &var, + ValueDimList dependencies, bool closedUB = false); /// Compute a bound in that is independent of all values in `independencies`. /// @@ -161,13 +191,10 @@ class ValueBoundsConstraintSet /// appear in the computed bound. static LogicalResult computeIndependentBound(AffineMap &resultMap, ValueDimList &mapOperands, - presburger::BoundType type, Value value, - std::optional dim, ValueRange independencies, - bool closedUB = false); + presburger::BoundType type, const Variable &var, + ValueRange independencies, bool closedUB = false); - /// Compute a constant bound for the given affine map, where dims and symbols - /// are bound to the given operands. The affine map must have exactly one - /// result. + /// Compute a constant bound for the given variable. /// /// This function traverses the backward slice of the given operands in a /// worklist-driven manner until `stopCondition` evaluates to "true". The @@ -182,16 +209,9 @@ class ValueBoundsConstraintSet /// By default, lower/equal bounds are closed and upper bounds are open. If /// `closedUB` is set to "true", upper bounds are also closed. static FailureOr - computeConstantBound(presburger::BoundType type, Value value, - std::optional dim = std::nullopt, + computeConstantBound(presburger::BoundType type, const Variable &var, StopConditionFn stopCondition = nullptr, bool closedUB = false); - static FailureOr computeConstantBound( - presburger::BoundType type, AffineMap map, ValueDimList mapOperands, - StopConditionFn stopCondition = nullptr, bool closedUB = false); - static FailureOr computeConstantBound( - presburger::BoundType type, AffineMap map, ArrayRef mapOperands, - StopConditionFn stopCondition = nullptr, bool closedUB = false); /// Compute a constant delta between the given two values. Return "failure" /// if a constant delta could not be determined. @@ -221,9 +241,8 @@ class ValueBoundsConstraintSet /// proven. This could be because the specified relation does in fact not hold /// or because there is not enough information in the constraint set. In other /// words, if we do not know for sure, this function returns "false". 
- bool populateAndCompare(OpFoldResult lhs, std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim); + bool populateAndCompare(const Variable &lhs, ComparisonOperator cmp, + const Variable &rhs); /// Return "true" if "lhs cmp rhs" was proven to hold. Return "false" if the /// specified relation could not be proven. This could be because the @@ -233,24 +252,12 @@ class ValueBoundsConstraintSet /// /// This function keeps traversing the backward slice of lhs/rhs until could /// prove the relation or until it ran out of IR. - static bool compare(OpFoldResult lhs, std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim); - static bool compare(AffineMap lhs, ValueDimList lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ValueDimList rhsOperands); - static bool compare(AffineMap lhs, ArrayRef lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ArrayRef rhsOperands); - - /// Compute whether the given values/dimensions are equal. Return "failure" if + static bool compare(const Variable &lhs, ComparisonOperator cmp, + const Variable &rhs); + + /// Compute whether the given variables are equal. Return "failure" if /// equality could not be determined. - /// - /// `dim1`/`dim2` must be `nullopt` if and only if `value1`/`value2` are - /// index-typed. - static FailureOr areEqual(OpFoldResult value1, OpFoldResult value2, - std::optional dim1 = std::nullopt, - std::optional dim2 = std::nullopt); + static FailureOr areEqual(const Variable &var1, const Variable &var2); /// Return "true" if the given slices are guaranteed to be overlapping. /// Return "false" if the given slices are guaranteed to be non-overlapping. @@ -317,9 +324,6 @@ class ValueBoundsConstraintSet /// /// This function does not analyze any IR and does not populate any additional /// constraints. - bool compareValueDims(OpFoldResult lhs, std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim); bool comparePos(int64_t lhsPos, ComparisonOperator cmp, int64_t rhsPos); /// Given an affine map with a single result (and map operands), add a new @@ -374,6 +378,7 @@ class ValueBoundsConstraintSet /// constraint system. Return the position of the new column. Any operands /// that were not analyzed yet are put on the worklist. int64_t insert(AffineMap map, ValueDimList operands, bool isSymbol = true); + int64_t insert(const Variable &var, bool isSymbol = true); /// Project out the given column in the constraint set. void projectOut(int64_t pos); @@ -381,6 +386,8 @@ class ValueBoundsConstraintSet /// Project out all columns for which the condition holds. void projectOut(function_ref condition); + void projectOutAnonymous(std::optional except = std::nullopt); + /// Mapping of columns to values/shape dimensions. SmallVector> positionToValueDim; /// Reverse mapping of values/shape dimensions to columns. 
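As a rough illustration of the unified entry points above, a hypothetical
caller might now write the following (a sketch only, not part of the patch;
`b`, `shapedVal`, `idx0`, and `idx1` are invented names):

  // Sketch: compare dimension 1 of a shaped value against an index value.
  // Both sides convert implicitly to ValueBoundsConstraintSet::Variable.
  using VBCS = ValueBoundsConstraintSet;
  bool proven = VBCS::compare(/*lhs=*/{shapedVal, /*dim=*/1},
                              VBCS::ComparisonOperator::LE,
                              /*rhs=*/idx0);

  // Sketch: an affine map + operands (here idx0 + idx1 + 1) is now a
  // first-class variable, so a constant bound can be requested directly.
  AffineMap map = AffineMap::get(
      /*dimCount=*/2, /*symbolCount=*/0,
      b.getAffineDimExpr(0) + b.getAffineDimExpr(1) + 1);
  FailureOr<int64_t> ub = VBCS::computeConstantBound(
      presburger::BoundType::UB, VBCS::Variable(map, {idx0, idx1}),
      /*stopCondition=*/nullptr, /*closedUB=*/true);

Previously each of these shapes required a dedicated overload; with
`Variable` they all go through the same `compare`/`computeBound` entry points.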
diff --git a/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp index e0c3abe7a0f71d..82a9fb0d490882 100644 --- a/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp @@ -120,9 +120,7 @@ mlir::affine::fullyComposeAndComputeConstantDelta(Value value1, Value value2) { mapOperands.push_back(value1); mapOperands.push_back(value2); affine::fullyComposeAffineMapAndOperands(&map, &mapOperands); - ValueDimList valueDims; - for (Value v : mapOperands) - valueDims.push_back({v, std::nullopt}); return ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::EQ, map, valueDims); + presburger::BoundType::EQ, + ValueBoundsConstraintSet::Variable(map, mapOperands)); } diff --git a/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp index 117ee8e8701ad7..1a266b72d1f8d3 100644 --- a/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp @@ -16,16 +16,15 @@ using namespace mlir; using namespace mlir::affine; -static FailureOr -reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, - Value value, std::optional dim, - ValueBoundsConstraintSet::StopConditionFn stopCondition, - bool closedUB) { +FailureOr mlir::affine::reifyValueBound( + OpBuilder &b, Location loc, presburger::BoundType type, + const ValueBoundsConstraintSet::Variable &var, + ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) { // Compute bound. AffineMap boundMap; ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeBound( - boundMap, mapOperands, type, value, dim, stopCondition, closedUB))) + boundMap, mapOperands, type, var, stopCondition, closedUB))) return failure(); // Reify bound. @@ -93,7 +92,7 @@ FailureOr mlir::affine::reifyShapedValueDimBound( // the owner of `value`. return v != value; }; - return reifyValueBound(b, loc, type, value, dim, + return reifyValueBound(b, loc, type, {value, dim}, stopCondition ? stopCondition : reifyToOperands, closedUB); } @@ -105,7 +104,7 @@ FailureOr mlir::affine::reifyIndexValueBound( ValueBoundsConstraintSet &cstr) { return v != value; }; - return reifyValueBound(b, loc, type, value, /*dim=*/std::nullopt, + return reifyValueBound(b, loc, type, value, stopCondition ? 
stopCondition : reifyToOperands, closedUB); } diff --git a/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp index f0d43808bc45df..7cfcc4180539c2 100644 --- a/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp @@ -107,9 +107,9 @@ struct SelectOpInterface // If trueValue <= falseValue: // * result <= falseValue // * result >= trueValue - if (cstr.compare(trueValue, dim, + if (cstr.compare(/*lhs=*/{trueValue, dim}, ValueBoundsConstraintSet::ComparisonOperator::LE, - falseValue, dim)) { + /*rhs=*/{falseValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(trueValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(falseValue, dim); @@ -121,9 +121,9 @@ struct SelectOpInterface // If falseValue <= trueValue: // * result <= trueValue // * result >= falseValue - if (cstr.compare(falseValue, dim, + if (cstr.compare(/*lhs=*/{falseValue, dim}, ValueBoundsConstraintSet::ComparisonOperator::LE, - trueValue, dim)) { + /*rhs=*/{trueValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(falseValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(trueValue, dim); diff --git a/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp b/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp index 79fabd6ed2e99a..f87f3d6350c022 100644 --- a/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp @@ -449,7 +449,7 @@ struct IndexCastPattern final : NarrowingPattern { return failure(); FailureOr ub = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, in, /*dim=*/std::nullopt, + presburger::BoundType::UB, in, /*stopCondition=*/nullptr, /*closedUB=*/true); if (failed(ub)) return failure(); diff --git a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp index fad221288f190e..5fb7953f937007 100644 --- a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp @@ -61,16 +61,15 @@ static Value buildArithValue(OpBuilder &b, Location loc, AffineMap map, return buildExpr(map.getResult(0)); } -static FailureOr -reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, - Value value, std::optional dim, - ValueBoundsConstraintSet::StopConditionFn stopCondition, - bool closedUB) { +FailureOr mlir::arith::reifyValueBound( + OpBuilder &b, Location loc, presburger::BoundType type, + const ValueBoundsConstraintSet::Variable &var, + ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) { // Compute bound. AffineMap boundMap; ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeBound( - boundMap, mapOperands, type, value, dim, stopCondition, closedUB))) + boundMap, mapOperands, type, var, stopCondition, closedUB))) return failure(); // Materialize tensor.dim/memref.dim ops. @@ -128,7 +127,7 @@ FailureOr mlir::arith::reifyShapedValueDimBound( // the owner of `value`. return v != value; }; - return reifyValueBound(b, loc, type, value, dim, + return reifyValueBound(b, loc, type, {value, dim}, stopCondition ? stopCondition : reifyToOperands, closedUB); } @@ -140,7 +139,7 @@ FailureOr mlir::arith::reifyIndexValueBound( ValueBoundsConstraintSet &cstr) { return v != value; }; - return reifyValueBound(b, loc, type, value, /*dim=*/std::nullopt, + return reifyValueBound(b, loc, type, value, stopCondition ? 
stopCondition : reifyToOperands, closedUB); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp index 8c4b70db248989..518d2e138c02a9 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp @@ -72,8 +72,10 @@ static LogicalResult computePaddedShape(linalg::LinalgOp opToPad, // Otherwise, try to compute a constant upper bound for the size value. FailureOr upperBound = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, opOperand->get(), - /*dim=*/i, /*stopCondition=*/nullptr, /*closedUB=*/true); + presburger::BoundType::UB, + {opOperand->get(), + /*dim=*/i}, + /*stopCondition=*/nullptr, /*closedUB=*/true); if (failed(upperBound)) { LLVM_DEBUG(DBGS() << "----could not compute a bounding box for padding"); return failure(); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp index ac896d6c30d049..71eb59d40836c1 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp @@ -257,14 +257,12 @@ FailureOr mlir::linalg::promoteSubviewAsNewBuffer( if (auto attr = llvm::dyn_cast_if_present(rangeValue.size)) { size = getValueOrCreateConstantIndexOp(b, loc, rangeValue.size); } else { - Value materializedSize = - getValueOrCreateConstantIndexOp(b, loc, rangeValue.size); FailureOr upperBound = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, materializedSize, /*dim=*/std::nullopt, + presburger::BoundType::UB, rangeValue.size, /*stopCondition=*/nullptr, /*closedUB=*/true); size = failed(upperBound) - ? materializedSize + ? getValueOrCreateConstantIndexOp(b, loc, rangeValue.size) : b.create(loc, *upperBound); } LLVM_DEBUG(llvm::dbgs() << "Extracted tightest: " << size << "\n"); diff --git a/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp b/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp index 10ba508265e7b9..1f06318cbd60e0 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp @@ -23,12 +23,11 @@ static FailureOr makeIndependent(OpBuilder &b, Location loc, ValueRange independencies) { if (ofr.is()) return ofr; - Value value = ofr.get(); AffineMap boundMap; ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeIndependentBound( - boundMap, mapOperands, presburger::BoundType::UB, value, - /*dim=*/std::nullopt, independencies, /*closedUB=*/true))) + boundMap, mapOperands, presburger::BoundType::UB, ofr, independencies, + /*closedUB=*/true))) return failure(); return affine::materializeComputedBound(b, loc, boundMap, mapOperands); } diff --git a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp index 087ffc438a830a..17a1c016ea16d5 100644 --- a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp @@ -61,12 +61,13 @@ struct ForOpInterface // An EQ constraint can be added if the yielded value (dimension size) // equals the corresponding block argument (dimension size). 
if (cstr.populateAndCompare( - yieldedValue, dim, ValueBoundsConstraintSet::ComparisonOperator::EQ, - iterArg, dim)) { + /*lhs=*/{yieldedValue, dim}, + ValueBoundsConstraintSet::ComparisonOperator::EQ, + /*rhs=*/{iterArg, dim})) { if (dim.has_value()) { cstr.bound(value)[*dim] == cstr.getExpr(initArg, dim); } else { - cstr.bound(value) == initArg; + cstr.bound(value) == cstr.getExpr(initArg); } } } @@ -113,8 +114,9 @@ struct IfOpInterface // * result <= elseValue // * result >= thenValue if (cstr.populateAndCompare( - thenValue, dim, ValueBoundsConstraintSet::ComparisonOperator::LE, - elseValue, dim)) { + /*lhs=*/{thenValue, dim}, + ValueBoundsConstraintSet::ComparisonOperator::LE, + /*rhs=*/{elseValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(thenValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(elseValue, dim); @@ -127,8 +129,9 @@ struct IfOpInterface // * result <= thenValue // * result >= elseValue if (cstr.populateAndCompare( - elseValue, dim, ValueBoundsConstraintSet::ComparisonOperator::LE, - thenValue, dim)) { + /*lhs=*/{elseValue, dim}, + ValueBoundsConstraintSet::ComparisonOperator::LE, + /*rhs=*/{thenValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(elseValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(thenValue, dim); diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp index 67080d8e301c13..d25efcf50ec566 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @@ -289,8 +289,7 @@ static UnpackTileDimInfo getUnpackTileDimInfo(OpBuilder &b, UnPackOp unpackOp, info.isAlignedToInnerTileSize = false; FailureOr cstSize = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, - getValueOrCreateConstantIndexOp(b, loc, tileSize), /*dim=*/std::nullopt, + presburger::BoundType::UB, tileSize, /*stopCondition=*/nullptr, /*closedUB=*/true); std::optional cstInnerSize = getConstantIntValue(innerTileSize); if (!failed(cstSize) && cstInnerSize) { diff --git a/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp b/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp index 721730862d49b3..a89ce20048dff3 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp @@ -28,7 +28,8 @@ static FailureOr makeIndependent(OpBuilder &b, Location loc, ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeIndependentBound( boundMap, mapOperands, presburger::BoundType::UB, value, - /*dim=*/std::nullopt, independencies, /*closedUB=*/true))) + independencies, + /*closedUB=*/true))) return failure(); return mlir::affine::materializeComputedBound(b, loc, boundMap, mapOperands); } diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp index 2dd91e2f7a1700..15381ec520e211 100644 --- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp @@ -154,7 +154,7 @@ bool mlir::tensor::isCastLikeInsertSliceOp(InsertSliceOp op) { continue; } FailureOr equalDimSize = ValueBoundsConstraintSet::areEqual( - op.getSource(), op.getResult(), srcDim, resultDim); + {op.getSource(), srcDim}, {op.getResult(), resultDim}); if (failed(equalDimSize) || !*equalDimSize) return false; ++srcDim; @@ -178,7 +178,7 @@ bool mlir::tensor::isCastLikeExtractSliceOp(ExtractSliceOp op) { continue; } FailureOr equalDimSize = 
ValueBoundsConstraintSet::areEqual( - op.getSource(), op.getResult(), dim, resultDim); + {op.getSource(), dim}, {op.getResult(), resultDim}); if (failed(equalDimSize) || !*equalDimSize) return false; ++resultDim; diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp index ffa4c0b55cad7c..87937591e60ad8 100644 --- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp +++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp @@ -25,6 +25,12 @@ namespace mlir { #include "mlir/Interfaces/ValueBoundsOpInterface.cpp.inc" } // namespace mlir +static Operation *getOwnerOfValue(Value value) { + if (auto bbArg = dyn_cast(value)) + return bbArg.getOwner()->getParentOp(); + return value.getDefiningOp(); +} + HyperrectangularSlice::HyperrectangularSlice(ArrayRef offsets, ArrayRef sizes, ArrayRef strides) @@ -67,6 +73,83 @@ static std::optional getConstantIntValue(OpFoldResult ofr) { return std::nullopt; } +ValueBoundsConstraintSet::Variable::Variable(OpFoldResult ofr) + : Variable(ofr, std::nullopt) {} + +ValueBoundsConstraintSet::Variable::Variable(Value indexValue) + : Variable(static_cast(indexValue)) {} + +ValueBoundsConstraintSet::Variable::Variable(Value shapedValue, int64_t dim) + : Variable(static_cast(shapedValue), std::optional(dim)) {} + +ValueBoundsConstraintSet::Variable::Variable(OpFoldResult ofr, + std::optional dim) { + Builder b(ofr.getContext()); + if (auto constInt = ::getConstantIntValue(ofr)) { + assert(!dim && "expected no dim for index-typed values"); + map = AffineMap::get(/*dimCount=*/0, /*symbolCount=*/0, + b.getAffineConstantExpr(*constInt)); + return; + } + Value value = cast(ofr); +#ifndef NDEBUG + if (dim) { + assert(isa(value.getType()) && "expected shaped type"); + } else { + assert(value.getType().isIndex() && "expected index type"); + } +#endif // NDEBUG + map = AffineMap::get(/*dimCount=*/0, /*symbolCount=*/1, + b.getAffineSymbolExpr(0)); + mapOperands.emplace_back(value, dim); +} + +ValueBoundsConstraintSet::Variable::Variable(AffineMap map, + ArrayRef mapOperands) { + assert(map.getNumResults() == 1 && "expected single result"); + + // Turn all dims into symbols. + Builder b(map.getContext()); + SmallVector dimReplacements, symReplacements; + for (int64_t i = 0, e = map.getNumDims(); i < e; ++i) + dimReplacements.push_back(b.getAffineSymbolExpr(i)); + for (int64_t i = 0, e = map.getNumSymbols(); i < e; ++i) + symReplacements.push_back(b.getAffineSymbolExpr(i + map.getNumDims())); + AffineMap tmpMap = map.replaceDimsAndSymbols( + dimReplacements, symReplacements, /*numResultDims=*/0, + /*numResultSyms=*/map.getNumSymbols() + map.getNumDims()); + + // Inline operands. + DenseMap replacements; + for (auto [index, var] : llvm::enumerate(mapOperands)) { + assert(var.map.getNumResults() == 1 && "expected single result"); + assert(var.map.getNumDims() == 0 && "expected only symbols"); + SmallVector symReplacements; + for (auto valueDim : var.mapOperands) { + auto it = llvm::find(this->mapOperands, valueDim); + if (it != this->mapOperands.end()) { + // There is already a symbol for this operand. + symReplacements.push_back(b.getAffineSymbolExpr( + std::distance(this->mapOperands.begin(), it))); + } else { + // This is a new operand: add a new symbol. 
+ symReplacements.push_back( + b.getAffineSymbolExpr(this->mapOperands.size())); + this->mapOperands.push_back(valueDim); + } + } + replacements[b.getAffineSymbolExpr(index)] = + var.map.getResult(0).replaceSymbols(symReplacements); + } + this->map = tmpMap.replace(replacements, /*numResultDims=*/0, + /*numResultSyms=*/this->mapOperands.size()); +} + +ValueBoundsConstraintSet::Variable::Variable(AffineMap map, + ArrayRef mapOperands) + : Variable(map, llvm::map_to_vector(mapOperands, + [](Value v) { return Variable(v); })) {} + ValueBoundsConstraintSet::ValueBoundsConstraintSet( MLIRContext *ctx, StopConditionFn stopCondition) : builder(ctx), stopCondition(stopCondition) { @@ -176,6 +259,11 @@ int64_t ValueBoundsConstraintSet::insert(Value value, assert(!valueDimToPosition.contains(valueDim) && "already mapped"); int64_t pos = isSymbol ? cstr.appendVar(VarKind::Symbol) : cstr.appendVar(VarKind::SetDim); + LLVM_DEBUG(llvm::dbgs() << "Inserting constraint set column " << pos + << " for: " << value + << " (dim: " << dim.value_or(kIndexValue) + << ", owner: " << getOwnerOfValue(value)->getName() + << ")\n"); positionToValueDim.insert(positionToValueDim.begin() + pos, valueDim); // Update reverse mapping. for (int64_t i = pos, e = positionToValueDim.size(); i < e; ++i) @@ -194,6 +282,8 @@ int64_t ValueBoundsConstraintSet::insert(Value value, int64_t ValueBoundsConstraintSet::insert(bool isSymbol) { int64_t pos = isSymbol ? cstr.appendVar(VarKind::Symbol) : cstr.appendVar(VarKind::SetDim); + LLVM_DEBUG(llvm::dbgs() << "Inserting anonymous constraint set column " << pos + << "\n"); positionToValueDim.insert(positionToValueDim.begin() + pos, std::nullopt); // Update reverse mapping. for (int64_t i = pos, e = positionToValueDim.size(); i < e; ++i) @@ -224,6 +314,10 @@ int64_t ValueBoundsConstraintSet::insert(AffineMap map, ValueDimList operands, return pos; } +int64_t ValueBoundsConstraintSet::insert(const Variable &var, bool isSymbol) { + return insert(var.map, var.mapOperands, isSymbol); +} + int64_t ValueBoundsConstraintSet::getPos(Value value, std::optional dim) const { #ifndef NDEBUG @@ -232,7 +326,10 @@ int64_t ValueBoundsConstraintSet::getPos(Value value, cast(value).getOwner()->isEntryBlock()) && "unstructured control flow is not supported"); #endif // NDEBUG - + LLVM_DEBUG(llvm::dbgs() << "Getting pos for: " << value + << " (dim: " << dim.value_or(kIndexValue) + << ", owner: " << getOwnerOfValue(value)->getName() + << ")\n"); auto it = valueDimToPosition.find(std::make_pair(value, dim.value_or(kIndexValue))); assert(it != valueDimToPosition.end() && "expected mapped entry"); @@ -253,12 +350,6 @@ bool ValueBoundsConstraintSet::isMapped(Value value, return it != valueDimToPosition.end(); } -static Operation *getOwnerOfValue(Value value) { - if (auto bbArg = dyn_cast(value)) - return bbArg.getOwner()->getParentOp(); - return value.getDefiningOp(); -} - void ValueBoundsConstraintSet::processWorklist() { LLVM_DEBUG(llvm::dbgs() << "Processing value bounds worklist...\n"); while (!worklist.empty()) { @@ -346,41 +437,47 @@ void ValueBoundsConstraintSet::projectOut( } } +void ValueBoundsConstraintSet::projectOutAnonymous( + std::optional except) { + int64_t nextPos = 0; + while (nextPos < static_cast(positionToValueDim.size())) { + if (positionToValueDim[nextPos].has_value() || except == nextPos) { + ++nextPos; + } else { + projectOut(nextPos); + // The column was projected out so another column is now at that position. + // Do not increase the counter. 
+ } + } +} + LogicalResult ValueBoundsConstraintSet::computeBound( AffineMap &resultMap, ValueDimList &mapOperands, presburger::BoundType type, - Value value, std::optional dim, StopConditionFn stopCondition, - bool closedUB) { -#ifndef NDEBUG - assertValidValueDim(value, dim); -#endif // NDEBUG - + const Variable &var, StopConditionFn stopCondition, bool closedUB) { + MLIRContext *ctx = var.getContext(); int64_t ubAdjustment = closedUB ? 0 : 1; - Builder b(value.getContext()); + Builder b(ctx); mapOperands.clear(); // Process the backward slice of `value` (i.e., reverse use-def chain) until // `stopCondition` is met. - ValueDim valueDim = std::make_pair(value, dim.value_or(kIndexValue)); - ValueBoundsConstraintSet cstr(value.getContext(), stopCondition); - assert(!stopCondition(value, dim, cstr) && - "stop condition should not be satisfied for starting point"); - int64_t pos = cstr.insert(value, dim, /*isSymbol=*/false); + ValueBoundsConstraintSet cstr(ctx, stopCondition); + int64_t pos = cstr.insert(var, /*isSymbol=*/false); + assert(pos == 0 && "expected first column"); cstr.processWorklist(); // Project out all variables (apart from `valueDim`) that do not match the // stop condition. cstr.projectOut([&](ValueDim p) { - // Do not project out `valueDim`. - if (valueDim == p) - return false; auto maybeDim = p.second == kIndexValue ? std::nullopt : std::make_optional(p.second); return !stopCondition(p.first, maybeDim, cstr); }); + cstr.projectOutAnonymous(/*except=*/pos); // Compute lower and upper bounds for `valueDim`. SmallVector lb(1), ub(1); - cstr.cstr.getSliceBounds(pos, 1, value.getContext(), &lb, &ub, + cstr.cstr.getSliceBounds(pos, 1, ctx, &lb, &ub, /*closedUB=*/true); // Note: There are TODOs in the implementation of `getSliceBounds`. In such a @@ -477,10 +574,9 @@ LogicalResult ValueBoundsConstraintSet::computeBound( LogicalResult ValueBoundsConstraintSet::computeDependentBound( AffineMap &resultMap, ValueDimList &mapOperands, presburger::BoundType type, - Value value, std::optional dim, ValueDimList dependencies, - bool closedUB) { + const Variable &var, ValueDimList dependencies, bool closedUB) { return computeBound( - resultMap, mapOperands, type, value, dim, + resultMap, mapOperands, type, var, [&](Value v, std::optional d, ValueBoundsConstraintSet &cstr) { return llvm::is_contained(dependencies, std::make_pair(v, d)); }, @@ -489,8 +585,7 @@ LogicalResult ValueBoundsConstraintSet::computeDependentBound( LogicalResult ValueBoundsConstraintSet::computeIndependentBound( AffineMap &resultMap, ValueDimList &mapOperands, presburger::BoundType type, - Value value, std::optional dim, ValueRange independencies, - bool closedUB) { + const Variable &var, ValueRange independencies, bool closedUB) { // Return "true" if the given value is independent of all values in // `independencies`. I.e., neither the value itself nor any value in the // backward slice (reverse use-def chain) is contained in `independencies`. @@ -516,7 +611,7 @@ LogicalResult ValueBoundsConstraintSet::computeIndependentBound( // Reify bounds in terms of any independent values. 
return computeBound( - resultMap, mapOperands, type, value, dim, + resultMap, mapOperands, type, var, [&](Value v, std::optional d, ValueBoundsConstraintSet &cstr) { return isIndependent(v); }, @@ -524,35 +619,8 @@ LogicalResult ValueBoundsConstraintSet::computeIndependentBound( } FailureOr ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType type, Value value, std::optional dim, - StopConditionFn stopCondition, bool closedUB) { -#ifndef NDEBUG - assertValidValueDim(value, dim); -#endif // NDEBUG - - AffineMap map = - AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, - Builder(value.getContext()).getAffineDimExpr(0)); - return computeConstantBound(type, map, {{value, dim}}, stopCondition, - closedUB); -} - -FailureOr ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType type, AffineMap map, ArrayRef operands, + presburger::BoundType type, const Variable &var, StopConditionFn stopCondition, bool closedUB) { - ValueDimList valueDims; - for (Value v : operands) { - assert(v.getType().isIndex() && "expected index type"); - valueDims.emplace_back(v, std::nullopt); - } - return computeConstantBound(type, map, valueDims, stopCondition, closedUB); -} - -FailureOr ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType type, AffineMap map, ValueDimList operands, - StopConditionFn stopCondition, bool closedUB) { - assert(map.getNumResults() == 1 && "expected affine map with one result"); - // Default stop condition if none was specified: Keep adding constraints until // a bound could be computed. int64_t pos = 0; @@ -562,8 +630,8 @@ FailureOr ValueBoundsConstraintSet::computeConstantBound( }; ValueBoundsConstraintSet cstr( - map.getContext(), stopCondition ? stopCondition : defaultStopCondition); - pos = cstr.populateConstraints(map, operands); + var.getContext(), stopCondition ? stopCondition : defaultStopCondition); + pos = cstr.populateConstraints(var.map, var.mapOperands); assert(pos == 0 && "expected `map` is the first column"); // Compute constant bound for `valueDim`. @@ -608,22 +676,13 @@ ValueBoundsConstraintSet::computeConstantDelta(Value value1, Value value2, Builder b(value1.getContext()); AffineMap map = AffineMap::get(/*dimCount=*/2, /*symbolCount=*/0, b.getAffineDimExpr(0) - b.getAffineDimExpr(1)); - return computeConstantBound(presburger::BoundType::EQ, map, - {{value1, dim1}, {value2, dim2}}); + return computeConstantBound(presburger::BoundType::EQ, + Variable(map, {{value1, dim1}, {value2, dim2}})); } -bool ValueBoundsConstraintSet::compareValueDims(OpFoldResult lhs, - std::optional lhsDim, - ComparisonOperator cmp, - OpFoldResult rhs, - std::optional rhsDim) { -#ifndef NDEBUG - if (auto lhsVal = dyn_cast(lhs)) - assertValidValueDim(lhsVal, lhsDim); - if (auto rhsVal = dyn_cast(rhs)) - assertValidValueDim(rhsVal, rhsDim); -#endif // NDEBUG - +bool ValueBoundsConstraintSet::comparePos(int64_t lhsPos, + ComparisonOperator cmp, + int64_t rhsPos) { // This function returns "true" if "lhs CMP rhs" is proven to hold. // // Example for ComparisonOperator::LE and index-typed values: We would like to @@ -640,50 +699,6 @@ bool ValueBoundsConstraintSet::compareValueDims(OpFoldResult lhs, return false; } - // EQ can be expressed as LE and GE. - if (cmp == EQ) - return compareValueDims(lhs, lhsDim, ComparisonOperator::LE, rhs, rhsDim) && - compareValueDims(lhs, lhsDim, ComparisonOperator::GE, rhs, rhsDim); - - // Construct inequality. For the above example: lhs > rhs. 
- // `IntegerRelation` inequalities are expressed in the "flattened" form and - // with ">= 0". I.e., lhs - rhs - 1 >= 0. - SmallVector eq(cstr.getNumCols(), 0); - auto addToEq = [&](OpFoldResult ofr, std::optional dim, - int64_t factor) { - if (auto constVal = ::getConstantIntValue(ofr)) { - eq[cstr.getNumCols() - 1] += *constVal * factor; - } else { - eq[getPos(cast(ofr), dim)] += factor; - } - }; - if (cmp == LT || cmp == LE) { - addToEq(lhs, lhsDim, 1); - addToEq(rhs, rhsDim, -1); - } else if (cmp == GT || cmp == GE) { - addToEq(lhs, lhsDim, -1); - addToEq(rhs, rhsDim, 1); - } else { - llvm_unreachable("unsupported comparison operator"); - } - if (cmp == LE || cmp == GE) - eq[cstr.getNumCols() - 1] -= 1; - - // Add inequality to the constraint set and check if it made the constraint - // set empty. - int64_t ineqPos = cstr.getNumInequalities(); - cstr.addInequality(eq); - bool isEmpty = cstr.isEmpty(); - cstr.removeInequality(ineqPos); - return isEmpty; -} - -bool ValueBoundsConstraintSet::comparePos(int64_t lhsPos, - ComparisonOperator cmp, - int64_t rhsPos) { - // This function returns "true" if "lhs CMP rhs" is proven to hold. For - // detailed documentation, see `compareValueDims`. - // EQ can be expressed as LE and GE. if (cmp == EQ) return comparePos(lhsPos, ComparisonOperator::LE, rhsPos) && @@ -712,48 +727,17 @@ bool ValueBoundsConstraintSet::comparePos(int64_t lhsPos, return isEmpty; } -bool ValueBoundsConstraintSet::populateAndCompare( - OpFoldResult lhs, std::optional lhsDim, ComparisonOperator cmp, - OpFoldResult rhs, std::optional rhsDim) { -#ifndef NDEBUG - if (auto lhsVal = dyn_cast(lhs)) - assertValidValueDim(lhsVal, lhsDim); - if (auto rhsVal = dyn_cast(rhs)) - assertValidValueDim(rhsVal, rhsDim); -#endif // NDEBUG - - if (auto lhsVal = dyn_cast(lhs)) - populateConstraints(lhsVal, lhsDim); - if (auto rhsVal = dyn_cast(rhs)) - populateConstraints(rhsVal, rhsDim); - - return compareValueDims(lhs, lhsDim, cmp, rhs, rhsDim); +bool ValueBoundsConstraintSet::populateAndCompare(const Variable &lhs, + ComparisonOperator cmp, + const Variable &rhs) { + int64_t lhsPos = populateConstraints(lhs.map, lhs.mapOperands); + int64_t rhsPos = populateConstraints(rhs.map, rhs.mapOperands); + return comparePos(lhsPos, cmp, rhsPos); } -bool ValueBoundsConstraintSet::compare(OpFoldResult lhs, - std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim) { - auto stopCondition = [&](Value v, std::optional dim, - ValueBoundsConstraintSet &cstr) { - // Keep processing as long as lhs/rhs are not mapped. - if (auto lhsVal = dyn_cast(lhs)) - if (!cstr.isMapped(lhsVal, dim)) - return false; - if (auto rhsVal = dyn_cast(rhs)) - if (!cstr.isMapped(rhsVal, dim)) - return false; - // Keep processing as long as the relation cannot be proven. 
- return cstr.compareValueDims(lhs, lhsDim, cmp, rhs, rhsDim); - }; - - ValueBoundsConstraintSet cstr(lhs.getContext(), stopCondition); - return cstr.populateAndCompare(lhs, lhsDim, cmp, rhs, rhsDim); -} - -bool ValueBoundsConstraintSet::compare(AffineMap lhs, ValueDimList lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ValueDimList rhsOperands) { +bool ValueBoundsConstraintSet::compare(const Variable &lhs, + ComparisonOperator cmp, + const Variable &rhs) { int64_t lhsPos = -1, rhsPos = -1; auto stopCondition = [&](Value v, std::optional dim, ValueBoundsConstraintSet &cstr) { @@ -765,39 +749,17 @@ bool ValueBoundsConstraintSet::compare(AffineMap lhs, ValueDimList lhsOperands, return cstr.comparePos(lhsPos, cmp, rhsPos); }; ValueBoundsConstraintSet cstr(lhs.getContext(), stopCondition); - lhsPos = cstr.insert(lhs, lhsOperands); - rhsPos = cstr.insert(rhs, rhsOperands); - cstr.processWorklist(); + lhsPos = cstr.populateConstraints(lhs.map, lhs.mapOperands); + rhsPos = cstr.populateConstraints(rhs.map, rhs.mapOperands); return cstr.comparePos(lhsPos, cmp, rhsPos); } -bool ValueBoundsConstraintSet::compare(AffineMap lhs, - ArrayRef lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ArrayRef rhsOperands) { - ValueDimList lhsValueDimOperands = - llvm::map_to_vector(lhsOperands, [](Value v) { - return std::make_pair(v, std::optional()); - }); - ValueDimList rhsValueDimOperands = - llvm::map_to_vector(rhsOperands, [](Value v) { - return std::make_pair(v, std::optional()); - }); - return ValueBoundsConstraintSet::compare(lhs, lhsValueDimOperands, cmp, rhs, - rhsValueDimOperands); -} - -FailureOr -ValueBoundsConstraintSet::areEqual(OpFoldResult value1, OpFoldResult value2, - std::optional dim1, - std::optional dim2) { - if (ValueBoundsConstraintSet::compare(value1, dim1, ComparisonOperator::EQ, - value2, dim2)) +FailureOr ValueBoundsConstraintSet::areEqual(const Variable &var1, + const Variable &var2) { + if (ValueBoundsConstraintSet::compare(var1, ComparisonOperator::EQ, var2)) return true; - if (ValueBoundsConstraintSet::compare(value1, dim1, ComparisonOperator::LT, - value2, dim2) || - ValueBoundsConstraintSet::compare(value1, dim1, ComparisonOperator::GT, - value2, dim2)) + if (ValueBoundsConstraintSet::compare(var1, ComparisonOperator::LT, var2) || + ValueBoundsConstraintSet::compare(var1, ComparisonOperator::GT, var2)) return false; return failure(); } @@ -833,7 +795,7 @@ ValueBoundsConstraintSet::areOverlappingSlices(MLIRContext *ctx, AffineMap foldedMap = foldAttributesIntoMap(b, map, ofrOperands, valueOperands); FailureOr constBound = computeConstantBound( - presburger::BoundType::EQ, foldedMap, valueOperands); + presburger::BoundType::EQ, Variable(foldedMap, valueOperands)); foundUnknownBound |= failed(constBound); if (succeeded(constBound) && *constBound <= 0) return false; @@ -850,7 +812,7 @@ ValueBoundsConstraintSet::areOverlappingSlices(MLIRContext *ctx, AffineMap foldedMap = foldAttributesIntoMap(b, map, ofrOperands, valueOperands); FailureOr constBound = computeConstantBound( - presburger::BoundType::EQ, foldedMap, valueOperands); + presburger::BoundType::EQ, Variable(foldedMap, valueOperands)); foundUnknownBound |= failed(constBound); if (succeeded(constBound) && *constBound <= 0) return false; diff --git a/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir index 23c6872dcebe94..935c08aceff548 100644 --- a/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir +++ 
b/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir @@ -131,3 +131,27 @@ func.func @compare_affine_min(%a: index, %b: index) { "test.compare"(%0, %a) {cmp = "LE"} : (index, index) -> () return } + +// ----- + +func.func @compare_const_map() { + %c5 = arith.constant 5 : index + // expected-remark @below{{true}} + "test.compare"(%c5) {cmp = "GT", rhs_map = affine_map<() -> (4)>} + : (index) -> () + // expected-remark @below{{true}} + "test.compare"(%c5) {cmp = "LT", lhs_map = affine_map<() -> (4)>} + : (index) -> () + return +} + +// ----- + +func.func @compare_maps(%a: index, %b: index) { + // expected-remark @below{{true}} + "test.compare"(%a, %b, %b, %a) + {cmp = "GT", lhs_map = affine_map<(d0, d1) -> (1 + d0 + d1)>, + rhs_map = affine_map<(d0, d1) -> (d0 + d1)>} + : (index, index, index, index) -> () + return +} diff --git a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp index 6730f9b292ad93..b098a5a23fd316 100644 --- a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp +++ b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp @@ -109,7 +109,7 @@ static LogicalResult testReifyValueBounds(func::FuncOp funcOp, FailureOr reified = failure(); if (constant) { auto reifiedConst = ValueBoundsConstraintSet::computeConstantBound( - boundType, value, dim, /*stopCondition=*/nullptr); + boundType, {value, dim}, /*stopCondition=*/nullptr); if (succeeded(reifiedConst)) reified = FailureOr(rewriter.getIndexAttr(*reifiedConst)); } else if (scalable) { @@ -128,22 +128,12 @@ static LogicalResult testReifyValueBounds(func::FuncOp funcOp, rewriter, loc, reifiedScalable->map, vscaleOperand); } } else { - if (dim) { - if (useArithOps) { - reified = arith::reifyShapedValueDimBound( - rewriter, op->getLoc(), boundType, value, *dim, stopCondition); - } else { - reified = reifyShapedValueDimBound(rewriter, op->getLoc(), boundType, - value, *dim, stopCondition); - } + if (useArithOps) { + reified = arith::reifyValueBound(rewriter, op->getLoc(), boundType, + op.getVariable(), stopCondition); } else { - if (useArithOps) { - reified = arith::reifyIndexValueBound( - rewriter, op->getLoc(), boundType, value, stopCondition); - } else { - reified = reifyIndexValueBound(rewriter, op->getLoc(), boundType, - value, stopCondition); - } + reified = reifyValueBound(rewriter, op->getLoc(), boundType, + op.getVariable(), stopCondition); } } if (failed(reified)) { @@ -188,9 +178,7 @@ static LogicalResult testEquality(func::FuncOp funcOp) { } auto compare = [&](ValueBoundsConstraintSet::ComparisonOperator cmp) { - return ValueBoundsConstraintSet::compare( - /*lhs=*/op.getLhs(), /*lhsDim=*/std::nullopt, cmp, - /*rhs=*/op.getRhs(), /*rhsDim=*/std::nullopt); + return ValueBoundsConstraintSet::compare(op.getLhs(), cmp, op.getRhs()); }; if (compare(cmpType)) { op->emitRemark("true"); diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp index 25c5190ca0ef3a..36d7606fe1345b 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp @@ -549,6 +549,12 @@ LogicalResult ReifyBoundOp::verify() { return success(); } +::mlir::ValueBoundsConstraintSet::Variable ReifyBoundOp::getVariable() { + if (getDim().has_value()) + return ValueBoundsConstraintSet::Variable(getVar(), *getDim()); + return ValueBoundsConstraintSet::Variable(getVar()); +} + ::mlir::ValueBoundsConstraintSet::ComparisonOperator CompareOp::getComparisonOperator() { if (getCmp() == "EQ") @@ -564,6 +570,37 @@ 
CompareOp::getComparisonOperator() { llvm_unreachable("invalid comparison operator"); } +::mlir::ValueBoundsConstraintSet::Variable CompareOp::getLhs() { + if (!getLhsMap()) + return ValueBoundsConstraintSet::Variable(getVarOperands()[0]); + SmallVector mapOperands( + getVarOperands().slice(0, getLhsMap()->getNumInputs())); + return ValueBoundsConstraintSet::Variable(*getLhsMap(), mapOperands); +} + +::mlir::ValueBoundsConstraintSet::Variable CompareOp::getRhs() { + int64_t rhsOperandsBegin = getLhsMap() ? getLhsMap()->getNumInputs() : 1; + if (!getRhsMap()) + return ValueBoundsConstraintSet::Variable( + getVarOperands()[rhsOperandsBegin]); + SmallVector mapOperands( + getVarOperands().slice(rhsOperandsBegin, getRhsMap()->getNumInputs())); + return ValueBoundsConstraintSet::Variable(*getRhsMap(), mapOperands); +} + +LogicalResult CompareOp::verify() { + if (getCompose() && (getLhsMap() || getRhsMap())) + return emitOpError( + "'compose' not supported when 'lhs_map' or 'rhs_map' is present"); + int64_t expectedNumOperands = getLhsMap() ? getLhsMap()->getNumInputs() : 1; + expectedNumOperands += getRhsMap() ? getRhsMap()->getNumInputs() : 1; + if (getVarOperands().size() != expectedNumOperands) + return emitOpError("expected ") + << expectedNumOperands << " operands, but got " + << getVarOperands().size(); + return success(); +} + //===----------------------------------------------------------------------===// // Test removing op with inner ops. //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index ebf158b8bb8203..b641b3da719c78 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -2207,6 +2207,7 @@ def ReifyBoundOp : TEST_Op<"reify_bound", [Pure]> { let extraClassDeclaration = [{ ::mlir::presburger::BoundType getBoundType(); + ::mlir::ValueBoundsConstraintSet::Variable getVariable(); }]; let hasVerifier = 1; @@ -2217,18 +2218,29 @@ def CompareOp : TEST_Op<"compare"> { Compare `lhs` and `rhs`. A remark is emitted which indicates whether the specified comparison operator was proven to hold. The remark also indicates whether the opposite comparison operator was proven to hold. + + `var_operands` must have exactly two operands: one for the LHS operand and + one for the RHS operand. If `lhs_map` is specified, as many operands as + `lhs_map` has inputs are expected instead of the first operand. If `rhs_map` + is specified, as many operands as `rhs_map` has inputs are expected instead + of the second operand. 
  }];
-  let arguments = (ins Index:$lhs,
-                       Index:$rhs,
+  let arguments = (ins Variadic<Index>:$var_operands,
                        DefaultValuedAttr<StrAttr, "\"EQ\"">:$cmp,
+                       OptionalAttr<AffineMapAttr>:$lhs_map,
+                       OptionalAttr<AffineMapAttr>:$rhs_map,
                        UnitAttr:$compose);
   let results = (outs);
 
   let extraClassDeclaration = [{
     ::mlir::ValueBoundsConstraintSet::ComparisonOperator
         getComparisonOperator();
+    ::mlir::ValueBoundsConstraintSet::Variable getLhs();
+    ::mlir::ValueBoundsConstraintSet::Variable getRhs();
   }];
+
+  let hasVerifier = 1;
 }
 
 //===----------------------------------------------------------------------===//

From 5a46123ddf62900d3dc73330f699c73038645198 Mon Sep 17 00:00:00 2001
From: Utkarsh Saxena
Date: Tue, 16 Apr 2024 11:01:03 +0200
Subject: [PATCH 052/300] Fix missing dtor in function calls accepting trivial
 ABI structs (#88751)

Fixes https://github.com/llvm/llvm-project/issues/88478

Promoting the `EHCleanup` to `NormalAndEHCleanup` in `EmitCallArgs`
surfaced another bug with deactivation of normal cleanups: we missed
emitting CPP scope ends for deactivated normal cleanups. This patch also
fixes that bug.

We missed emitting CPP scope ends because we remove the `fallthrough`
(clears the insertion point) before deactivating normal cleanups. This is
to make the emitted "normal" cleanup code unreachable. But we still need
to emit CPP scope ends in the original basic block even for a deactivated
normal cleanup. (This worked correctly before because we did not remove
the `fallthrough` for `EHCleanup`s.)
---
 clang/lib/CodeGen/CGCall.cpp                  | 13 ++++---
 clang/lib/CodeGen/CGCleanup.cpp               | 37 ++++++++++++-------
 clang/lib/CodeGen/CodeGenFunction.h           |  3 +-
 .../CodeGenCXX/control-flow-in-stmt-expr.cpp  | 16 ++++++++
 4 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 7a0bc6fa77b889..0c860a3ccbd2f0 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -4694,11 +4694,11 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
   AggValueSlot Slot = args.isUsingInAlloca()
                           ? createPlaceholderSlot(*this, type)
                           : CreateAggTemp(type, "agg.tmp");
-  bool DestroyedInCallee = true, NeedsEHCleanup = true;
+  bool DestroyedInCallee = true, NeedsCleanup = true;
   if (const auto *RD = type->getAsCXXRecordDecl())
     DestroyedInCallee = RD->hasNonTrivialDestructor();
   else
-    NeedsEHCleanup = needsEHCleanup(type.isDestructedType());
+    NeedsCleanup = type.isDestructedType();
 
   if (DestroyedInCallee)
     Slot.setExternallyDestructed();
@@ -4707,14 +4707,15 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
   RValue RV = Slot.asRValue();
   args.add(RV, type);
 
-  if (DestroyedInCallee && NeedsEHCleanup) {
+  if (DestroyedInCallee && NeedsCleanup) {
     // Create a no-op GEP between the placeholder and the cleanup so we can
     // RAUW it successfully.  It also serves as a marker of the first
     // instruction where the cleanup is active.
-    pushFullExprCleanup<DestroyUnpassedArg>(EHCleanup, Slot.getAddress(),
-                                            type);
+    pushFullExprCleanup<DestroyUnpassedArg>(NormalAndEHCleanup,
+                                            Slot.getAddress(), type);
 
     // This unreachable is a temporary marker which will be removed later.
- llvm::Instruction *IsActive = Builder.CreateUnreachable(); + llvm::Instruction *IsActive = + Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy)); args.addArgCleanupDeactivation(EHStack.stable_begin(), IsActive); } return; diff --git a/clang/lib/CodeGen/CGCleanup.cpp b/clang/lib/CodeGen/CGCleanup.cpp index 5bf48bc22a5495..8683f19d9da28e 100644 --- a/clang/lib/CodeGen/CGCleanup.cpp +++ b/clang/lib/CodeGen/CGCleanup.cpp @@ -634,12 +634,19 @@ static void destroyOptimisticNormalEntry(CodeGenFunction &CGF, /// Pops a cleanup block. If the block includes a normal cleanup, the /// current insertion point is threaded through the cleanup, as are /// any branch fixups on the cleanup. -void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { +void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, + bool ForDeactivation) { assert(!EHStack.empty() && "cleanup stack is empty!"); assert(isa(*EHStack.begin()) && "top not a cleanup!"); EHCleanupScope &Scope = cast(*EHStack.begin()); assert(Scope.getFixupDepth() <= EHStack.getNumBranchFixups()); + // If we are deactivating a normal cleanup, we need to pretend that the + // fallthrough is unreachable. We restore this IP before returning. + CGBuilderTy::InsertPoint NormalDeactivateOrigIP; + if (ForDeactivation && (Scope.isNormalCleanup() || !getLangOpts().EHAsynch)) { + NormalDeactivateOrigIP = Builder.saveAndClearIP(); + } // Remember activation information. bool IsActive = Scope.isActive(); Address NormalActiveFlag = @@ -729,6 +736,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { EHStack.popCleanup(); // safe because there are no fixups assert(EHStack.getNumBranchFixups() == 0 || EHStack.hasNormalCleanups()); + if (NormalDeactivateOrigIP.isSet()) + Builder.restoreIP(NormalDeactivateOrigIP); return; } @@ -765,9 +774,16 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { if (!RequiresNormalCleanup) { // Mark CPP scope end for passed-by-value Arg temp // per Windows ABI which is "normally" Cleanup in callee - if (IsEHa && getInvokeDest() && Builder.GetInsertBlock()) { - if (Personality.isMSVCXXPersonality()) + if (IsEHa && getInvokeDest()) { + // If we are deactivating a normal cleanup then we don't have a + // fallthrough. Restore original IP to emit CPP scope ends in the correct + // block. + if (NormalDeactivateOrigIP.isSet()) + Builder.restoreIP(NormalDeactivateOrigIP); + if (Personality.isMSVCXXPersonality() && Builder.GetInsertBlock()) EmitSehCppScopeEnd(); + if (NormalDeactivateOrigIP.isSet()) + NormalDeactivateOrigIP = Builder.saveAndClearIP(); } destroyOptimisticNormalEntry(*this, Scope); Scope.MarkEmitted(); @@ -992,6 +1008,8 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { } } + if (NormalDeactivateOrigIP.isSet()) + Builder.restoreIP(NormalDeactivateOrigIP); assert(EHStack.hasNormalCleanups() || EHStack.getNumBranchFixups() == 0); // Emit the EH cleanup if required. @@ -1281,17 +1299,8 @@ void CodeGenFunction::DeactivateCleanupBlock(EHScopeStack::stable_iterator C, // to the current RunCleanupsScope. if (C == EHStack.stable_begin() && CurrentCleanupScopeDepth.strictlyEncloses(C)) { - // Per comment below, checking EHAsynch is not really necessary - // it's there to assure zero-impact w/o EHAsynch option - if (!Scope.isNormalCleanup() && getLangOpts().EHAsynch) { - PopCleanupBlock(); - } else { - // If it's a normal cleanup, we need to pretend that the - // fallthrough is unreachable. 
-      CGBuilderTy::InsertPoint SavedIP = Builder.saveAndClearIP();
-      PopCleanupBlock();
-      Builder.restoreIP(SavedIP);
-    }
+    PopCleanupBlock(/*FallthroughIsBranchThrough=*/false,
+                    /*ForDeactivation=*/true);
     return;
   }
 
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index c49e9fd00c8d3e..d99188671f1f60 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -957,7 +957,8 @@ class CodeGenFunction : public CodeGenTypeCache {
 
   /// PopCleanupBlock - Will pop the cleanup entry on the stack and
   /// process all branch fixups.
-  void PopCleanupBlock(bool FallThroughIsBranchThrough = false);
+  void PopCleanupBlock(bool FallThroughIsBranchThrough = false,
+                       bool ForDeactivation = false);
 
   /// DeactivateCleanupBlock - Deactivates the given cleanup block.
   /// The block cannot be reactivated.  Pops it if it's the top of the
diff --git a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
index 95deee8bb1f1f2..0a51b0e4121c33 100644
--- a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
+++ b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp
@@ -391,3 +391,19 @@ void ArrayInitWithContinue() {
   })};
  }
 }
+
+struct [[clang::trivial_abi]] HasTrivialABI {
+  HasTrivialABI();
+  ~HasTrivialABI();
+};
+void AcceptTrivialABI(HasTrivialABI, int);
+void TrivialABI() {
+  // CHECK-LABEL: define dso_local void @_Z10TrivialABIv()
+  AcceptTrivialABI(HasTrivialABI(), ({
+    if (foo()) return;
+    // CHECK: if.then:
+    // CHECK-NEXT: call void @_ZN13HasTrivialABID1Ev
+    // CHECK-NEXT: br label %return
+    0;
+  }));
+}

From 9141e1c24f87e5735bc4178a018eba4bdf2750aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Fri, 12 Apr 2024 10:43:14 +0200
Subject: [PATCH 053/300] [clang][Interp] Gracefully handle bitcasts to
 non-primitive types

We were calling classifyPrim() instead of classify().
---
 clang/lib/AST/Interp/ByteCodeExprGen.cpp | 2 +-
 clang/test/AST/Interp/vectors.cpp        | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 01ec31e4077f70..5866228663dca2 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -262,7 +262,7 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
       return this->discard(SubExpr);
 
     std::optional<PrimType> FromT = classify(SubExpr->getType());
-    std::optional<PrimType> ToT = classifyPrim(CE->getType());
+    std::optional<PrimType> ToT = classify(CE->getType());
 
     if (!FromT || !ToT)
       return false;
diff --git a/clang/test/AST/Interp/vectors.cpp b/clang/test/AST/Interp/vectors.cpp
index 8afef3c897bff7..fb5787a9eda9a9 100644
--- a/clang/test/AST/Interp/vectors.cpp
+++ b/clang/test/AST/Interp/vectors.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
 // RUN: %clang_cc1 -verify=ref,both %s
 
-// both-no-diagnostics
+// ref-no-diagnostics
 
 typedef int __attribute__((vector_size(16))) VI4;
 constexpr VI4 A = {1,2,3,4};
@@ -20,3 +20,9 @@ namespace Vector {
   }
   constexpr auto v2 = g(4);
 }
+
+/// FIXME: We need to support BitCasts between vector types.
+namespace { + typedef float __attribute__((vector_size(16))) VI42; + constexpr VI42 A2 = A; // expected-error {{must be initialized by a constant expression}} +} From 4fc0a99b8f220b6b41648da491bcc81a067f1600 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 16 Apr 2024 10:14:15 +0100 Subject: [PATCH 054/300] [AMDGPU] Fix implicit operands of VOPD cndmask instructions (#87788) --- llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 1 + llvm/test/CodeGen/AMDGPU/vopd-combine.mir | 32 +++++++++++------------ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index 05e10a95b157c9..1dda1b89b2d36c 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -101,6 +101,7 @@ class GCNCreateVOPD : public MachineFunctionPass { } } + SII->fixImplicitOperands(*VOPDInst); for (auto CompIdx : VOPD::COMPONENTS) VOPDInst.copyImplicitOps(*MI[CompIdx]); diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir index 63bef40c34742f..b8ac50c3aeb5e8 100644 --- a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir +++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir @@ -160,7 +160,7 @@ body: | ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $sgpr20 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo @@ -174,7 +174,7 @@ body: | ; PAIR-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $sgpr20 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx12 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx12 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo @@ -458,9 +458,9 @@ body: | ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, 
implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo ; PAIR-GFX11-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec @@ -476,9 +476,9 @@ body: | ; PAIR-GFX12-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX12-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx12 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx12 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo ; PAIR-GFX12-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec @@ -559,12 +559,12 @@ body: | ; 
PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo @@ -586,12 +586,12 @@ body: | ; PAIR-GFX12-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, 
implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX12-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx12 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx12 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX12-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx12 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx12 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo From f4f772ceef379bd434d266b6e0d2bbdf796f81cb Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 16 Apr 2024 10:15:00 +0100 Subject: [PATCH 055/300] [AMDGPU] Stop reserving $vcc_hi in wave32 mode (#87783) This gives us one extra SGPR to play with. 
The comment suggested that it could cause bugs, but I have tested it with Vulkan CTS with the default wave size for compute shaders set to 32 and did not find any problems. --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 7 -- llvm/test/CodeGen/AMDGPU/bf16.ll | 22 +++-- .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 20 ++--- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 22 ++--- llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 90 +++++++++---------- .../AMDGPU/sgpr-spill-overlap-wwm-reserve.mir | 26 +++--- 6 files changed, 88 insertions(+), 99 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 245731ad5fc7c9..acb54fd10b90dc 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -612,13 +612,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Reserve null register - it shall never be allocated reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64); - // Disallow vcc_hi allocation in wave32. It may be allocated but most likely - // will result in bugs. - if (isWave32) { - Reserved.set(AMDGPU::VCC); - Reserved.set(AMDGPU::VCC_HI); - } - // Reserve SGPRs. // unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index bf4302c156d83d..4c9c34de7194ce 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -38342,12 +38342,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 @@ -38366,7 +38365,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0 -; GFX10-NEXT: v_writelane_b32 v40, s35, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27 ; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25 ; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23 @@ -38377,10 +38376,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13 ; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11 ; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7 -; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9 ; GFX10-NEXT: s_waitcnt vmcnt(32) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX10-NEXT: s_waitcnt vmcnt(31) @@ -38460,10 +38459,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s31 -; GFX10-NEXT: 
v_cndmask_b32_e64 v1, v55, v16, s30 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s34 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s35 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi +; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34 ; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4 ; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100 ; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100 @@ -38481,7 +38480,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100 ; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100 ; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 -; GFX10-NEXT: v_readlane_b32 s35, v40, 3 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index ec3c08ec795235..da64c379672ef7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -1259,17 +1259,17 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind { ; GFX10SELDAG-LABEL: isnan_v4f16: ; GFX10SELDAG: ; %bb.0: ; GFX10SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10SELDAG-NEXT: v_mov_b32_e32 v2, 3 -; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v0, 3 -; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD -; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s5, v0, v2 src0_sel:WORD_1 src1_sel:DWORD +; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v0, 3 +; GFX10SELDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v1, 3 +; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v0, v3 src0_sel:WORD_1 src1_sel:DWORD +; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v5 +; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v3 src0_sel:WORD_1 src1_sel:DWORD +; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v4 ; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v4 -; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 -; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v1, 3 -; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v5 -; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 ; GFX10SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10GLISEL-LABEL: isnan_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index ab6a9dcf71acef..a87fa8bf36d9e7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -7404,35 +7404,35 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31 ; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35 ; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 -; GFX12-NEXT: s_lshr_b32 s12, s0, 16 -; GFX12-NEXT: s_mov_b32 s14, s1 -; GFX12-NEXT: s_lshr_b32 s16, s1, 16 -; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000 ; GFX12-NEXT: s_lshr_b32 s2, s2, 16 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 
v5, s23 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25 +; GFX12-NEXT: s_mov_b32 s12, s1 +; GFX12-NEXT: s_lshr_b32 s14, s1, 16 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 ; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 +; GFX12-NEXT: s_lshr_b32 s0, s0, 16 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21 ; GFX12-NEXT: v_mov_b32_e32 v18, s20 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64 -; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v0, s16 ; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 -; GFX12-NEXT: v_dual_mov_b32 v9, s15 :: v_dual_mov_b32 v8, s14 -; GFX12-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v10, s16 +; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12 +; GFX12-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v10, s14 ; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10 -; GFX12-NEXT: v_dual_mov_b32 v23, s13 :: v_dual_mov_b32 v22, s12 +; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 952827b8cd0e71..889755c23bbc72 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -8808,73 +8808,73 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v2, 8, s6 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s5 ; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2 -; GFX12-NEXT: s_lshr_b32 s24, s7, 16 +; GFX12-NEXT: s_lshr_b32 s22, s7, 16 ; GFX12-NEXT: v_bfe_i32 v31, v1, 0, 8 -; GFX12-NEXT: s_lshr_b32 s42, s2, 24 -; GFX12-NEXT: s_mov_b32 s48, s7 +; GFX12-NEXT: s_lshr_b32 s40, s2, 24 +; GFX12-NEXT: s_mov_b32 s46, s7 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 ; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1 -; GFX12-NEXT: s_lshr_b32 s26, s6, 16 -; GFX12-NEXT: s_lshr_b32 s44, s1, 16 +; GFX12-NEXT: s_lshr_b32 s24, s6, 16 +; GFX12-NEXT: s_lshr_b32 s42, s1, 16 ; GFX12-NEXT: s_ashr_i64 s[58:59], s[6:7], 56 -; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX12-NEXT: v_lshrrev_b16 v6, 8, s3 ; GFX12-NEXT: v_lshrrev_b16 v3, 8, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s24 -; GFX12-NEXT: s_lshr_b32 s28, s6, 24 -; GFX12-NEXT: s_lshr_b32 s30, s5, 16 -; GFX12-NEXT: s_lshr_b32 s40, s2, 16 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s22 +; GFX12-NEXT: s_lshr_b32 
s26, s6, 24 +; GFX12-NEXT: s_lshr_b32 s28, s5, 16 +; GFX12-NEXT: s_lshr_b32 s38, s2, 16 ; GFX12-NEXT: v_bfe_i32 v11, v8, 0, 8 ; GFX12-NEXT: v_bfe_i32 v23, v4, 0, 8 ; GFX12-NEXT: v_bfe_i32 v27, v2, 0, 8 ; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31 -; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v34, s25 :: v_dual_mov_b32 v35, s58 -; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s26 -; GFX12-NEXT: v_dual_mov_b32 v56, s43 :: v_dual_mov_b32 v29, s48 -; GFX12-NEXT: v_mov_b32_e32 v30, s49 -; GFX12-NEXT: s_lshr_b32 s46, s0, 24 -; GFX12-NEXT: s_mov_b32 s50, s5 -; GFX12-NEXT: s_mov_b32 s52, s3 -; GFX12-NEXT: s_lshr_b32 s34, s4, 16 -; GFX12-NEXT: s_lshr_b32 s36, s4, 24 -; GFX12-NEXT: s_ashr_i64 s[22:23], s[2:3], 56 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s58 +; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s24 +; GFX12-NEXT: v_dual_mov_b32 v56, s41 :: v_dual_mov_b32 v29, s46 +; GFX12-NEXT: v_mov_b32_e32 v30, s47 +; GFX12-NEXT: s_lshr_b32 s44, s0, 24 +; GFX12-NEXT: s_mov_b32 s48, s5 +; GFX12-NEXT: s_mov_b32 s50, s3 +; GFX12-NEXT: s_lshr_b32 s30, s4, 16 +; GFX12-NEXT: s_lshr_b32 s34, s4, 24 +; GFX12-NEXT: s_ashr_i64 s[54:55], s[2:3], 56 ; GFX12-NEXT: s_ashr_i64 s[56:57], s[4:5], 56 ; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 8 ; GFX12-NEXT: v_bfe_i32 v19, v5, 0, 8 -; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX12-NEXT: s_lshr_b32 s38, s3, 16 -; GFX12-NEXT: s_mov_b32 s54, s1 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX12-NEXT: s_lshr_b32 s36, s3, 16 +; GFX12-NEXT: s_mov_b32 s52, s1 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[50:51], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[48:49], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x80000 ; GFX12-NEXT: s_lshr_b32 s20, s0, 16 ; GFX12-NEXT: s_ashr_i64 s[18:19], s[0:1], 56 ; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX12-NEXT: v_bfe_i32 v15, v6, 0, 8 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v38, s27 :: v_dual_mov_b32 v39, s28 -; GFX12-NEXT: v_dual_mov_b32 v40, s29 :: v_dual_mov_b32 v41, s30 -; GFX12-NEXT: v_dual_mov_b32 v42, s31 :: v_dual_mov_b32 v43, s56 -; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s34 -; GFX12-NEXT: v_dual_mov_b32 v52, s23 :: v_dual_mov_b32 v53, s40 -; GFX12-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s42 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v38, s25 :: v_dual_mov_b32 v39, s26 +; GFX12-NEXT: v_dual_mov_b32 v40, s27 :: v_dual_mov_b32 v41, s28 +; GFX12-NEXT: v_dual_mov_b32 v42, s29 :: v_dual_mov_b32 v43, s56 +; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s30 +; GFX12-NEXT: v_dual_mov_b32 v52, s55 :: v_dual_mov_b32 v53, s38 +; GFX12-NEXT: v_dual_mov_b32 v54, s39 :: v_dual_mov_b32 v55, s40 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX12-NEXT: s_bfe_i64 
s[0:1], s[54:55], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[52:53], 0x80000 ; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27 ; GFX12-NEXT: global_store_b128 v0, v[33:36], s[8:9] offset:240 -; GFX12-NEXT: v_mov_b32_e32 v33, s44 +; GFX12-NEXT: v_mov_b32_e32 v33, s42 ; GFX12-NEXT: global_store_b128 v0, v[29:32], s[8:9] offset:224 ; GFX12-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17 ; GFX12-NEXT: v_dual_mov_b32 v32, s7 :: v_dual_mov_b32 v21, s4 @@ -8882,16 +8882,16 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v9, s12 ; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s0 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v46, s35 :: v_dual_mov_b32 v47, s36 -; GFX12-NEXT: v_dual_mov_b32 v48, s37 :: v_dual_mov_b32 v49, s38 -; GFX12-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s18 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v46, s31 :: v_dual_mov_b32 v47, s34 +; GFX12-NEXT: v_dual_mov_b32 v48, s35 :: v_dual_mov_b32 v49, s36 +; GFX12-NEXT: v_dual_mov_b32 v34, s43 :: v_dual_mov_b32 v35, s18 ; GFX12-NEXT: v_dual_mov_b32 v36, s19 :: v_dual_mov_b32 v29, s20 ; GFX12-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GFX12-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v13, s2 ; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v1, s10 -; GFX12-NEXT: v_dual_mov_b32 v50, s39 :: v_dual_mov_b32 v51, s22 +; GFX12-NEXT: v_dual_mov_b32 v50, s37 :: v_dual_mov_b32 v51, s54 ; GFX12-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s6 ; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index f8e7cb397b475e..8a5f75332557e6 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -28,18 +28,17 @@ body: | ; GCN-LABEL: name: test_main ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN-NEXT: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, 
$sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr0 = COPY $sgpr33 + ; GCN-NEXT: $vcc_hi = frame-setup COPY $sgpr33 ; GCN-NEXT: $sgpr33 = frame-setup COPY $sgpr32 - ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5) ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5) ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5) ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.74, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 - ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr5 + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.73, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc ; GCN-NEXT: renamable $vgpr2 = IMPLICIT_DEF ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr3 @@ -116,18 +115,18 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: KILL implicit-def $vcc_lo, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: - ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 3 ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 2 @@ -198,16 +197,15 @@ body: | ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 ; GCN-NEXT: KILL killed renamable $vgpr2 - ; GCN-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR 
$vgpr5, 4 - ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5) ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) - ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.74, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; GCN-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc - ; GCN-NEXT: $sgpr33 = COPY $sgpr0 + ; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi ; GCN-NEXT: S_ENDPGM 0 bb.0: liveins: $vgpr0 From 485d556d8c23b54da952e75c3cadc9db3050fd9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Sat, 13 Apr 2024 06:10:38 +0200 Subject: [PATCH 056/300] [clang][Interp][NFC] Add Block::dump() --- clang/lib/AST/Interp/Disasm.cpp | 16 ++++++++++++++++ clang/lib/AST/Interp/InterpBlock.h | 3 +++ 2 files changed, 19 insertions(+) diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index 022b394e58e643..ebc4e4f195ba62 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -264,3 +264,19 @@ LLVM_DUMP_METHOD void Record::dump(llvm::raw_ostream &OS, unsigned Indentation, ++I; } } + +LLVM_DUMP_METHOD void Block::dump(llvm::raw_ostream &OS) const { + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_BLUE, true}); + OS << "Block " << (void *)this << "\n"; + } + unsigned NPointers = 0; + for (const Pointer *P = Pointers; P; P = P->Next) { + ++NPointers; + } + OS << " Pointers: " << NPointers << "\n"; + OS << " Dead: " << IsDead << "\n"; + OS << " Static: " << IsStatic << "\n"; + OS << " Extern: " << IsExtern << "\n"; + OS << " Initialized: " << IsInitialized << "\n"; +} diff --git a/clang/lib/AST/Interp/InterpBlock.h b/clang/lib/AST/Interp/InterpBlock.h index 9db82567d2d5d6..6d5856fbd4ea19 100644 --- a/clang/lib/AST/Interp/InterpBlock.h +++ b/clang/lib/AST/Interp/InterpBlock.h @@ -118,6 +118,9 @@ class Block final { IsInitialized = false; } + void dump() const { dump(llvm::errs()); } + void dump(llvm::raw_ostream &OS) const; + protected: friend class Pointer; friend class DeadBlock; From 80fce05f2104d1c42db814276130536b014fcca2 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Tue, 16 Apr 2024 17:16:35 +0800 Subject: [PATCH 057/300] [InstCombine] Fold `minmax (X & NegPow2C, Y & NegPow2C) -> minmax(X, Y) & NegPow2C` (#88859) Alive2: https://alive2.llvm.org/ce/z/NFtkSX This optimization will be beneficial to jemalloc users. 
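For intuition: masking with a negated power of two, `X & NegPow2C`, rounds `X`
down to a multiple of that power of two, which is a monotone operation in both
the signed and the unsigned order, so it commutes with smin/smax/umin/umax.
Below is a small brute-force sketch of the signed case in C++, illustrative
only; the constant -64 stands in for an arbitrary NegPow2C.

```cpp
#include <algorithm>
#include <cassert>

// Checks smin/smax(X & -64, Y & -64) == smin/smax(X, Y) & -64 over all
// 8-bit signed values. Masking with -64 rounds towards negative infinity
// to a multiple of 64, so it preserves the signed order, and a
// min/max of order-preserved values picks the same operand.
int main() {
  const int Mask = -64; // a negated power of two, like NegPow2C in the fold
  for (int X = -128; X <= 127; ++X)
    for (int Y = -128; Y <= 127; ++Y) {
      assert(std::min(X & Mask, Y & Mask) == (std::min(X, Y) & Mask));
      assert(std::max(X & Mask, Y & Mask) == (std::max(X, Y) & Mask));
    }
  return 0;
}
```

Note that the transform only fires when both `and`s have a single use (see the
`m_OneUse` checks below and the `@test_smin_and_multiuse` negative test).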
--- .../InstCombine/InstCombineCalls.cpp | 8 +- .../InstCombine/minmax-intrinsics.ll | 89 +++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index bae8579fc3650b..ba5db854647a42 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1774,6 +1774,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Instruction *I = moveAddAfterMinMax(II, Builder)) return I; + // minmax (X & NegPow2C, Y & NegPow2C) --> minmax(X, Y) & NegPow2C + const APInt *RHSC; + if (match(I0, m_OneUse(m_And(m_Value(X), m_NegatedPower2(RHSC)))) && + match(I1, m_OneUse(m_And(m_Value(Y), m_SpecificInt(*RHSC))))) + return BinaryOperator::CreateAnd(Builder.CreateBinaryIntrinsic(IID, X, Y), + ConstantInt::get(II->getType(), *RHSC)); + // smax(X, -X) --> abs(X) // smin(X, -X) --> -abs(X) // umax(X, -X) --> -abs(X) @@ -1815,7 +1822,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return NewMinMax; // Try to fold minmax with constant RHS based on range information - const APInt *RHSC; if (match(I1, m_APIntAllowUndef(RHSC))) { ICmpInst::Predicate Pred = ICmpInst::getNonStrictPredicate(MinMaxIntrinsic::getPredicate(IID)); diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll index ae2e115b1dd9a2..bd1a47bbfcc193 100644 --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -2581,3 +2581,92 @@ entry: %val = call i8 @llvm.umin.i8(i8 %sub, i8 3) ret i8 %val } + +define i8 @test_umax_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_umax_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.umax.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.umax.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_umin_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_umin_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.umin.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.umin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smax_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smax_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smax.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.smax.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smin_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smin_and_mismatch(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and_mismatch( +; CHECK-NEXT: [[X1:%.*]] = and i8 [[X:%.*]], -64 +; CHECK-NEXT: [[Y1:%.*]] = and i8 [[Y:%.*]], -32 +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1]], i8 [[Y1]]) +; CHECK-NEXT: ret i8 [[RES]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -32 + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 
@test_smin_and_non_negated_pow2(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and_non_negated_pow2( +; CHECK-NEXT: [[X1:%.*]] = and i8 [[X:%.*]], 31 +; CHECK-NEXT: [[Y1:%.*]] = and i8 [[Y:%.*]], 31 +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1]], i8 [[Y1]]) +; CHECK-NEXT: ret i8 [[RES]] +; + %x1 = and i8 %x, 31 + %y1 = and i8 %y, 31 + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smin_and_multiuse(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and_multiuse( +; CHECK-NEXT: [[X1:%.*]] = and i8 [[X:%.*]], 31 +; CHECK-NEXT: [[Y1:%.*]] = and i8 [[Y:%.*]], 31 +; CHECK-NEXT: call void @use(i8 [[Y1]]) +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1]], i8 [[Y1]]) +; CHECK-NEXT: ret i8 [[RES]] +; + %x1 = and i8 %x, 31 + %y1 = and i8 %y, 31 + call void @use(i8 %y1) + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} From cce026bf8f7dcf5aa402a6da20f0d4da56aee8b5 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Tue, 16 Apr 2024 17:19:45 +0800 Subject: [PATCH 058/300] [mlir][test] Fix -Wsign-compare in TestDialect.cpp (NFC) llvm-project/mlir/test/lib/Dialect/Test/TestDialect.cpp:597:31: error: comparison of integers of different signs: 'size_t' (aka 'unsigned long') and 'int64_t' (aka 'long') [-Werror,-Wsign-compare] if (getVarOperands().size() != expectedNumOperands) ~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~~~ 1 error generated. --- mlir/test/lib/Dialect/Test/TestDialect.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp index 36d7606fe1345b..a23ed89c4b04d1 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp @@ -594,7 +594,7 @@ LogicalResult CompareOp::verify() { "'compose' not supported when 'lhs_map' or 'rhs_map' is present"); int64_t expectedNumOperands = getLhsMap() ? getLhsMap()->getNumInputs() : 1; expectedNumOperands += getRhsMap() ? getRhsMap()->getNumInputs() : 1; - if (getVarOperands().size() != expectedNumOperands) + if (getVarOperands().size() != size_t(expectedNumOperands)) return emitOpError("expected ") << expectedNumOperands << " operands, but got " << getVarOperands().size(); From 58b49cef1d772a922a433fd4a42e41db3f18d34b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Sat, 13 Apr 2024 15:06:22 +0200 Subject: [PATCH 059/300] [clang][Interp] Support __builtin_vectorelements --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 9 +++++++++ clang/test/AST/Interp/vectors.cpp | 2 ++ 2 files changed, 11 insertions(+) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 5866228663dca2..93059edc4622f8 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -1251,6 +1251,15 @@ bool ByteCodeExprGen::VisitUnaryExprOrTypeTraitExpr( return this->emitConst(Size.getQuantity(), E); } + if (Kind == UETT_VectorElements) { + if (const auto *VT = E->getTypeOfArgument()->getAs()) + return this->emitConst(VT->getNumElements(), E); + + // FIXME: Apparently we need to catch the fact that a sizeless vector type + // has been passed and diagnose that (at run time). 
+ assert(E->getTypeOfArgument()->isSizelessVectorType()); + } + return false; } diff --git a/clang/test/AST/Interp/vectors.cpp b/clang/test/AST/Interp/vectors.cpp index fb5787a9eda9a9..6c5d916f51f563 100644 --- a/clang/test/AST/Interp/vectors.cpp +++ b/clang/test/AST/Interp/vectors.cpp @@ -13,12 +13,14 @@ namespace Vector { return VI4 { n * 3, n + 4, n - 5, n / 6 }; } constexpr auto v1 = f(10); + static_assert(__builtin_vectorelements(v1) == (16 / sizeof(int)), ""); typedef double __attribute__((vector_size(32))) VD4; constexpr VD4 g(int n) { return (VD4) { n / 2.0, n + 1.5, n - 5.4, n * 0.9 }; } constexpr auto v2 = g(4); + static_assert(__builtin_vectorelements(v2) == (32 / sizeof(double)), ""); } /// FIXME: We need to support BitCasts between vector types. From 70fe6ad535365de20c3a960dcfe97bd5cf68abb5 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 16 Apr 2024 10:24:23 +0100 Subject: [PATCH 060/300] [MLIR][OpenMP] Make omp.distribute into a loop wrapper (#87239) This patch updates the definition of `omp.distribute` to enforce the restrictions of a wrapper operation. --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 22 +++++++++-- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 11 ++++++ mlir/test/Dialect/OpenMP/invalid.mlir | 34 ++++++++++++++++- mlir/test/Dialect/OpenMP/ops.mlir | 38 +++++++++++++++---- 4 files changed, 93 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 3abdbe3adfd0be..7e677c9839f602 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -840,7 +840,8 @@ def YieldOp : OpenMP_Op<"yield", //===----------------------------------------------------------------------===// def DistributeOp : OpenMP_Op<"distribute", [AttrSizedOperandSegments, DeclareOpInterfaceMethods, - RecursiveMemoryEffects]> { + RecursiveMemoryEffects, + SingleBlockImplicitTerminator<"TerminatorOp">]> { let summary = "distribute construct"; let description = [{ The distribute construct specifies that the iterations of one or more loops @@ -855,15 +856,28 @@ def DistributeOp : OpenMP_Op<"distribute", [AttrSizedOperandSegments, The distribute loop construct specifies that the iterations of the loop(s) will be executed in parallel by threads in the current context. These iterations are spread across threads that already exist in the enclosing - region. The lower and upper bounds specify a half-open range: the - range includes the lower bound but does not include the upper bound. If the - `inclusive` attribute is specified then the upper bound is also included. + region. + + The body region can contain a single block which must contain a single + operation and a terminator. The operation must be another compatible loop + wrapper or an `omp.loop_nest`. The `dist_schedule_static` attribute specifies the schedule for this loop, determining how the loop is distributed across the parallel threads. The optional `schedule_chunk` associated with this determines further controls this distribution. 
+ ```mlir + omp.distribute { + omp.loop_nest (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { + %a = load %arrA[%i1, %i2] : memref + %b = load %arrB[%i1, %i2] : memref + %sum = arith.addf %a, %b : f32 + store %sum, %arrC[%i1, %i2] : memref + omp.yield + } + } + ``` // TODO: private_var, firstprivate_var, lastprivate_var, collapse }]; let arguments = (ins diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 90b49b2528b790..365b3c4246bfd9 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1656,6 +1656,17 @@ LogicalResult DistributeOp::verify() { return emitError( "expected equal sizes for allocate and allocator variables"); + if (!isWrapper()) + return emitOpError() << "must be a loop wrapper"; + + if (LoopWrapperInterface nested = getNestedWrapper()) { + // Check for the allowed leaf constructs that may appear in a composite + // construct directly after DISTRIBUTE. + if (!isa(nested)) + return emitError() << "only supported nested wrappers are 'omp.parallel' " + "and 'omp.simdloop'"; + } + return success(); } diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 88dca1b85ee5f7..f8739887214bfd 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -1866,7 +1866,16 @@ func.func @omp_target_depend(%data_var: memref) { // ----- -func.func @omp_distribute(%data_var : memref) -> () { +func.func @omp_distribute_schedule(%chunk_size : i32) -> () { + // expected-error @below {{op chunk size set without dist_schedule_static being present}} + "omp.distribute"(%chunk_size) <{operandSegmentSizes = array}> ({ + "omp.terminator"() : () -> () + }) : (i32) -> () +} + +// ----- + +func.func @omp_distribute_allocate(%data_var : memref) -> () { // expected-error @below {{expected equal sizes for allocate and allocator variables}} "omp.distribute"(%data_var) <{operandSegmentSizes = array}> ({ "omp.terminator"() : () -> () @@ -1875,6 +1884,29 @@ func.func @omp_distribute(%data_var : memref) -> () { // ----- +func.func @omp_distribute_wrapper() -> () { + // expected-error @below {{op must be a loop wrapper}} + "omp.distribute"() ({ + %0 = arith.constant 0 : i32 + "omp.terminator"() : () -> () + }) : () -> () +} + +// ----- + +func.func @omp_distribute_nested_wrapper(%data_var : memref) -> () { + // expected-error @below {{only supported nested wrappers are 'omp.parallel' and 'omp.simdloop'}} + "omp.distribute"() ({ + "omp.wsloop"() ({ + %0 = arith.constant 0 : i32 + "omp.terminator"() : () -> () + }) : () -> () + "omp.terminator"() : () -> () + }) : () -> () +} + +// ----- + omp.private {type = private} @x.privatizer : i32 alloc { ^bb0(%arg0: i32): %0 = arith.constant 0.0 : f32 diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 851d44ad984eef..018b82e995d7d3 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -559,30 +559,54 @@ func.func @omp_simdloop_pretty_multiple(%lb1 : index, %ub1 : index, %step1 : ind } // CHECK-LABEL: omp_distribute -func.func @omp_distribute(%chunk_size : i32, %data_var : memref) -> () { +func.func @omp_distribute(%chunk_size : i32, %data_var : memref, %arg0 : i32) -> () { // CHECK: omp.distribute "omp.distribute" () ({ - omp.terminator + "omp.loop_nest" (%arg0, %arg0, %arg0) ({ + ^bb0(%iv: i32): + "omp.yield"() : () -> () + }) : (i32, i32, i32) -> () + "omp.terminator"() : () -> () }) {} : () 
-> () // CHECK: omp.distribute omp.distribute { - omp.terminator + omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } } // CHECK: omp.distribute dist_schedule_static omp.distribute dist_schedule_static { - omp.terminator + omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } } // CHECK: omp.distribute dist_schedule_static chunk_size(%{{.+}} : i32) omp.distribute dist_schedule_static chunk_size(%chunk_size : i32) { - omp.terminator + omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } } // CHECK: omp.distribute order(concurrent) omp.distribute order(concurrent) { - omp.terminator + omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } } // CHECK: omp.distribute allocate(%{{.+}} : memref -> %{{.+}} : memref) omp.distribute allocate(%data_var : memref -> %data_var : memref) { - omp.terminator + omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } + } + // CHECK: omp.distribute + omp.distribute { + // TODO Remove induction variables from omp.simdloop. + omp.simdloop for (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.loop_nest (%iv2) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } + omp.yield + } } return } From 668a58b8926473d731c41c55007f1fe4571ada86 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Tue, 16 Apr 2024 10:25:26 +0100 Subject: [PATCH 061/300] [flang][runtime] Add ACCESS library procedure (#88517) Re-land https://github.com/llvm/llvm-project/pull/88395 Two build-bots were broken by the old version: - https://lab.llvm.org/buildbot/#/builders/285/builds/245 - https://lab.llvm.org/buildbot/#/builders/21/builds/96988 The problem in both cases was that the compiler did not support `std::filesystem` (which I use in the unit test). I have removed the dependency upon std::filesystem because there isn't an easy way to add the right linker options so that this is supported correctly in all build environments [1] [1] https://gitlab.kitware.com/cmake/cmake/-/issues/17834 --- This is a GNU extension: https://gcc.gnu.org/onlinedocs/gfortran/ACCESS.html Used in SALMON: https://salmon-tddft.jp/download.html Unfortunately the intrinsic takes a file path to operate on so there isn't an easy way to make the test robust. The unit test expects to be able to create, set read write and execute permissions, and delete files called std::filesystem::temp_directory_path() / . The test will fail if a file already exists with that name. I have not implemented the intrinsic on Windows because this is wrapping a POSIX system call and Windows doesn't support all of the permission bits tested by the intrinsic. I don't have a Windows machine easily available to check if Gfortran implements this intrinsic on Windows. --- flang/docs/Intrinsics.md | 8 + flang/include/flang/Runtime/extensions.h | 7 + flang/runtime/extensions.cpp | 73 ++++ flang/unittests/Runtime/AccessTest.cpp | 422 +++++++++++++++++++++++ flang/unittests/Runtime/CMakeLists.txt | 1 + 5 files changed, 511 insertions(+) create mode 100644 flang/unittests/Runtime/AccessTest.cpp diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index ccb93e104dab65..848619cb65d909 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -657,6 +657,14 @@ CALL CO_REDUCE CALL CO_SUM ``` +### Inquiry Functions +ACCESS (GNU extension) is not supported on Windows. Otherwise: +``` +CHARACTER(LEN=*) :: path = 'path/to/file' +IF (ACCESS(path, 'rwx')) & + ... 
+``` + ## Non-standard intrinsics ### PGI ``` diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h index 7d0952206fc195..fef651f3b2eedb 100644 --- a/flang/include/flang/Runtime/extensions.h +++ b/flang/include/flang/Runtime/extensions.h @@ -44,5 +44,12 @@ std::int64_t RTNAME(Signal)(std::int64_t number, void (*handler)(int)); // GNU extension subroutine SLEEP(SECONDS) void RTNAME(Sleep)(std::int64_t seconds); +// GNU extension function ACCESS(NAME, MODE) +// TODO: not supported on Windows +#ifndef _WIN32 +std::int64_t FORTRAN_PROCEDURE_NAME(access)(const char *name, + std::int64_t nameLength, const char *mode, std::int64_t modeLength); +#endif + } // extern "C" #endif // FORTRAN_RUNTIME_EXTENSIONS_H_ diff --git a/flang/runtime/extensions.cpp b/flang/runtime/extensions.cpp index 3ac98000335d7d..12498b502ae1cf 100644 --- a/flang/runtime/extensions.cpp +++ b/flang/runtime/extensions.cpp @@ -17,6 +17,7 @@ #include "flang/Runtime/entry-names.h" #include "flang/Runtime/io-api.h" #include +#include #include #include #include @@ -138,5 +139,77 @@ void RTNAME(Sleep)(std::int64_t seconds) { std::this_thread::sleep_for(std::chrono::seconds(seconds)); } +// TODO: not supported on Windows +#ifndef _WIN32 +std::int64_t FORTRAN_PROCEDURE_NAME(access)(const char *name, + std::int64_t nameLength, const char *mode, std::int64_t modeLength) { + std::int64_t ret{-1}; + if (nameLength <= 0 || modeLength <= 0 || !name || !mode) { + return ret; + } + + // ensure name is null terminated + char *newName{nullptr}; + if (name[nameLength - 1] != '\0') { + newName = static_cast(std::malloc(nameLength + 1)); + std::memcpy(newName, name, nameLength); + newName[nameLength] = '\0'; + name = newName; + } + + // calculate mode + bool read{false}; + bool write{false}; + bool execute{false}; + bool exists{false}; + int imode{0}; + + for (std::int64_t i = 0; i < modeLength; ++i) { + switch (mode[i]) { + case 'r': + read = true; + break; + case 'w': + write = true; + break; + case 'x': + execute = true; + break; + case ' ': + exists = true; + break; + default: + // invalid mode + goto cleanup; + } + } + if (!read && !write && !execute && !exists) { + // invalid mode + goto cleanup; + } + + if (!read && !write && !execute) { + imode = F_OK; + } else { + if (read) { + imode |= R_OK; + } + if (write) { + imode |= W_OK; + } + if (execute) { + imode |= X_OK; + } + } + ret = access(name, imode); + +cleanup: + if (newName) { + free(newName); + } + return ret; +} +#endif + } // namespace Fortran::runtime } // extern "C" diff --git a/flang/unittests/Runtime/AccessTest.cpp b/flang/unittests/Runtime/AccessTest.cpp new file mode 100644 index 00000000000000..66f19f78c7cfb6 --- /dev/null +++ b/flang/unittests/Runtime/AccessTest.cpp @@ -0,0 +1,422 @@ +//===-- flang/unittests/Runtime/AccessTest.cpp ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// TODO: ACCESS is not yet implemented on Windows +#ifndef _WIN32 + +#include "CrashHandlerFixture.h" +#include "gtest/gtest.h" +#include "flang/Runtime/extensions.h" +#include "llvm/ADT/Twine.h" + +#include +#include +#include +#include + +namespace { + +struct AccessTests : public CrashHandlerFixture {}; + +struct AccessType { + bool read{false}; + bool write{false}; + bool execute{false}; + bool exists{false}; +}; + +} // namespace + +static std::string addPIDSuffix(const char *name) { + std::stringstream ss; + ss << name; + ss << '.'; + + ss << getpid(); + + return ss.str(); +} + +static bool exists(const std::string &path) { + return access(path.c_str(), F_OK) == 0; +} + +// Implementation of std::filesystem::temp_directory_path adapted from libcxx +// See llvm-project/libcxx/src/filesystem/operations.cpp +// Using std::filesystem is inconvenient because the required flags are not +// consistent accross compilers and CMake doesn't have built in support to +// determine the correct flags. +static const char *temp_directory_path() { + // TODO: Windows + const char *env_paths[] = {"TMPDIR", "TMP", "TEMP", "TEMPDIR"}; + const char *ret = nullptr; + + for (auto &ep : env_paths) { + if ((ret = getenv(ep))) { + break; + } + } + + if (ret == nullptr) { +#if defined(__ANDROID__) + ret = "/data/local/tmp"; +#else + ret = "/tmp"; +#endif + } + + assert(exists(ret)); + return ret; +} + +static std::string createTemporaryFile( + const char *name, const AccessType &accessType) { + std::string path = + (llvm::Twine{temp_directory_path()} + "/" + addPIDSuffix(name)).str(); + + // O_CREAT | O_EXCL enforces that this file is newly created by this call. + // This feels risky. If we don't have permission to create files in the + // temporary directory or if the files already exist, the test will fail. 
+ // But we can't use std::tmpfile() because we need a path to the file and + // to control the filesystem permissions + mode_t mode{0}; + if (accessType.read) { + mode |= S_IRUSR; + } + if (accessType.write) { + mode |= S_IWUSR; + } + if (accessType.execute) { + mode |= S_IXUSR; + } + + int file = open(path.c_str(), O_CREAT | O_EXCL, mode); + if (file == -1) { + return {}; + } + + close(file); + + return path; +} + +static std::int64_t callAccess( + const std::string &path, const AccessType &accessType) { + const char *cpath{path.c_str()}; + std::int64_t pathlen = std::strlen(cpath); + + std::string mode; + if (accessType.read) { + mode += 'r'; + } + if (accessType.write) { + mode += 'w'; + } + if (accessType.execute) { + mode += 'x'; + } + if (accessType.exists) { + mode += ' '; + } + + const char *cmode = mode.c_str(); + std::int64_t modelen = std::strlen(cmode); + + return FORTRAN_PROCEDURE_NAME(access)(cpath, pathlen, cmode, modelen); +} + +TEST(AccessTests, TestExists) { + AccessType accessType; + accessType.exists = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotExists) { + std::string nonExistant{addPIDSuffix(__func__)}; + ASSERT_FALSE(exists(nonExistant)); + + AccessType accessType; + accessType.exists = true; + std::int64_t res = callAccess(nonExistant, accessType); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestRead) { + AccessType accessType; + accessType.read = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotRead) { + AccessType accessType; + accessType.read = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestWrite) { + AccessType accessType; + accessType.write = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotWrite) { + AccessType accessType; + accessType.write = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.write = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestReadWrite) { + AccessType accessType; + accessType.read = true; + accessType.write = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotReadWrite0) { + AccessType accessType; + accessType.read = false; + accessType.write = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotReadWrite1) { + 
AccessType accessType; + accessType.read = true; + accessType.write = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotReadWrite2) { + AccessType accessType; + accessType.read = false; + accessType.write = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestExecute) { + AccessType accessType; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotExecute) { + AccessType accessType; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestRWX) { + AccessType accessType; + accessType.read = true; + accessType.write = true; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotRWX0) { + AccessType accessType; + accessType.read = false; + accessType.write = false; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX1) { + AccessType accessType; + accessType.read = true; + accessType.write = false; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX2) { + AccessType accessType; + accessType.read = true; + accessType.write = true; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX3) { + AccessType accessType; + accessType.read = true; + accessType.write = false; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX4) { + AccessType 
accessType; + accessType.read = false; + accessType.write = true; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +#endif // !_WIN32 diff --git a/flang/unittests/Runtime/CMakeLists.txt b/flang/unittests/Runtime/CMakeLists.txt index 23f02aa751246b..f7caacad3a598f 100644 --- a/flang/unittests/Runtime/CMakeLists.txt +++ b/flang/unittests/Runtime/CMakeLists.txt @@ -1,4 +1,5 @@ add_flang_unittest(FlangRuntimeTests + AccessTest.cpp Allocatable.cpp ArrayConstructor.cpp BufferTest.cpp From 76782e28869abf93716f72f195d55c28eaf263ed Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Tue, 16 Apr 2024 10:29:26 +0100 Subject: [PATCH 062/300] [Flang][OpenMP] NFC: Remove old reduction lowering code (#88798) The old code was replaced by https://github.com/llvm/llvm-project/pull/80019. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 223 ------------------------------ 1 file changed, 223 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 3dcfe0fd775dc5..352ca66e8735b6 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -103,21 +103,6 @@ static fir::GlobalOp globalInitialization( return global; } -static mlir::Operation *getCompareFromReductionOp(mlir::Operation *reductionOp, - mlir::Value loadVal) { - for (mlir::Value reductionOperand : reductionOp->getOperands()) { - if (mlir::Operation *compareOp = reductionOperand.getDefiningOp()) { - if (compareOp->getOperand(0) == loadVal || - compareOp->getOperand(1) == loadVal) - assert((mlir::isa(compareOp) || - mlir::isa(compareOp)) && - "Expected comparison not found in reduction intrinsic"); - return compareOp; - } - } - return nullptr; -} - // Get the extended value for \p val by extracting additional variable // information from \p base. 
static fir::ExtendedValue getExtendedValue(fir::ExtendedValue base, @@ -237,213 +222,6 @@ createAndSetPrivatizedLoopVar(Fortran::lower::AbstractConverter &converter, return storeOp; } -static mlir::Operation * -findReductionChain(mlir::Value loadVal, mlir::Value *reductionVal = nullptr) { - for (mlir::OpOperand &loadOperand : loadVal.getUses()) { - if (mlir::Operation *reductionOp = loadOperand.getOwner()) { - if (auto convertOp = mlir::dyn_cast(reductionOp)) { - for (mlir::OpOperand &convertOperand : convertOp.getRes().getUses()) { - if (mlir::Operation *reductionOp = convertOperand.getOwner()) - return reductionOp; - } - } - for (mlir::OpOperand &reductionOperand : reductionOp->getUses()) { - if (auto store = - mlir::dyn_cast(reductionOperand.getOwner())) { - if (store.getMemref() == *reductionVal) { - store.erase(); - return reductionOp; - } - } - if (auto assign = - mlir::dyn_cast(reductionOperand.getOwner())) { - if (assign.getLhs() == *reductionVal) { - assign.erase(); - return reductionOp; - } - } - } - } - } - return nullptr; -} - -// for a logical operator 'op' reduction X = X op Y -// This function returns the operation responsible for converting Y from -// fir.logical<4> to i1 -static fir::ConvertOp getConvertFromReductionOp(mlir::Operation *reductionOp, - mlir::Value loadVal) { - for (mlir::Value reductionOperand : reductionOp->getOperands()) { - if (auto convertOp = - mlir::dyn_cast(reductionOperand.getDefiningOp())) { - if (convertOp.getOperand() == loadVal) - continue; - return convertOp; - } - } - return nullptr; -} - -static void updateReduction(mlir::Operation *op, - fir::FirOpBuilder &firOpBuilder, - mlir::Value loadVal, mlir::Value reductionVal, - fir::ConvertOp *convertOp = nullptr) { - mlir::OpBuilder::InsertPoint insertPtDel = firOpBuilder.saveInsertionPoint(); - firOpBuilder.setInsertionPoint(op); - - mlir::Value reductionOp; - if (convertOp) - reductionOp = convertOp->getOperand(); - else if (op->getOperand(0) == loadVal) - reductionOp = op->getOperand(1); - else - reductionOp = op->getOperand(0); - - firOpBuilder.create(op->getLoc(), reductionOp, - reductionVal); - firOpBuilder.restoreInsertionPoint(insertPtDel); -} - -static void removeStoreOp(mlir::Operation *reductionOp, mlir::Value symVal) { - for (mlir::Operation *reductionOpUse : reductionOp->getUsers()) { - if (auto convertReduction = - mlir::dyn_cast(reductionOpUse)) { - for (mlir::Operation *convertReductionUse : - convertReduction.getRes().getUsers()) { - if (auto storeOp = mlir::dyn_cast(convertReductionUse)) { - if (storeOp.getMemref() == symVal) - storeOp.erase(); - } - if (auto assignOp = - mlir::dyn_cast(convertReductionUse)) { - if (assignOp.getLhs() == symVal) - assignOp.erase(); - } - } - } - } -} - -// Generate an OpenMP reduction operation. -// TODO: Currently assumes it is either an integer addition/multiplication -// reduction, or a logical and reduction. Generalize this for various reduction -// operation types. -// TODO: Generate the reduction operation during lowering instead of creating -// and removing operations since this is not a robust approach. Also, removing -// ops in the builder (instead of a rewriter) is probably not the best approach. 
-static void -genOpenMPReduction(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - const Fortran::parser::OmpClauseList &clauseList) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - - List clauses{makeClauses(clauseList, semaCtx)}; - - for (const Clause &clause : clauses) { - if (const auto &reductionClause = - std::get_if(&clause.u)) { - const auto &redOperatorList{ - std::get( - reductionClause->t)}; - assert(redOperatorList.size() == 1 && "Expecting single operator"); - const auto &redOperator = redOperatorList.front(); - const auto &objects{std::get(reductionClause->t)}; - if (const auto *reductionOp = - std::get_if(&redOperator.u)) { - const auto &intrinsicOp{ - std::get( - reductionOp->u)}; - - switch (intrinsicOp) { - case clause::DefinedOperator::IntrinsicOperator::Add: - case clause::DefinedOperator::IntrinsicOperator::Multiply: - case clause::DefinedOperator::IntrinsicOperator::AND: - case clause::DefinedOperator::IntrinsicOperator::EQV: - case clause::DefinedOperator::IntrinsicOperator::OR: - case clause::DefinedOperator::IntrinsicOperator::NEQV: - break; - default: - continue; - } - for (const Object &object : objects) { - if (const Fortran::semantics::Symbol *symbol = object.id()) { - mlir::Value reductionVal = converter.getSymbolAddress(*symbol); - if (auto declOp = reductionVal.getDefiningOp()) - reductionVal = declOp.getBase(); - mlir::Type reductionType = - reductionVal.getType().cast().getEleTy(); - if (!reductionType.isa()) { - if (!reductionType.isIntOrIndexOrFloat()) - continue; - } - for (mlir::OpOperand &reductionValUse : reductionVal.getUses()) { - if (auto loadOp = - mlir::dyn_cast(reductionValUse.getOwner())) { - mlir::Value loadVal = loadOp.getRes(); - if (reductionType.isa()) { - mlir::Operation *reductionOp = findReductionChain(loadVal); - fir::ConvertOp convertOp = - getConvertFromReductionOp(reductionOp, loadVal); - updateReduction(reductionOp, firOpBuilder, loadVal, - reductionVal, &convertOp); - removeStoreOp(reductionOp, reductionVal); - } else if (mlir::Operation *reductionOp = - findReductionChain(loadVal, &reductionVal)) { - updateReduction(reductionOp, firOpBuilder, loadVal, - reductionVal); - } - } - } - } - } - } else if (const auto *reductionIntrinsic = - std::get_if(&redOperator.u)) { - if (!ReductionProcessor::supportedIntrinsicProcReduction( - *reductionIntrinsic)) - continue; - ReductionProcessor::ReductionIdentifier redId = - ReductionProcessor::getReductionType(*reductionIntrinsic); - for (const Object &object : objects) { - if (const Fortran::semantics::Symbol *symbol = object.id()) { - mlir::Value reductionVal = converter.getSymbolAddress(*symbol); - if (auto declOp = reductionVal.getDefiningOp()) - reductionVal = declOp.getBase(); - for (const mlir::OpOperand &reductionValUse : - reductionVal.getUses()) { - if (auto loadOp = - mlir::dyn_cast(reductionValUse.getOwner())) { - mlir::Value loadVal = loadOp.getRes(); - // Max is lowered as a compare -> select. - // Match the pattern here. 
- mlir::Operation *reductionOp = - findReductionChain(loadVal, &reductionVal); - if (reductionOp == nullptr) - continue; - - if (redId == ReductionProcessor::ReductionIdentifier::MAX || - redId == ReductionProcessor::ReductionIdentifier::MIN) { - assert(mlir::isa(reductionOp) && - "Selection Op not found in reduction intrinsic"); - mlir::Operation *compareOp = - getCompareFromReductionOp(reductionOp, loadVal); - updateReduction(compareOp, firOpBuilder, loadVal, - reductionVal); - } - if (redId == ReductionProcessor::ReductionIdentifier::IOR || - redId == ReductionProcessor::ReductionIdentifier::IEOR || - redId == ReductionProcessor::ReductionIdentifier::IAND) { - updateReduction(reductionOp, firOpBuilder, loadVal, - reductionVal); - } - } - } - } - } - } - } - } -} - struct OpWithBodyGenInfo { /// A type for a code-gen callback function. This takes as argument the op for /// which the code is being generated and returns the arguments of the op's @@ -2197,7 +1975,6 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, // 2.9.3.1 SIMD construct createSimdLoop(converter, semaCtx, eval, ompDirective, loopOpClauseList, currentLocation); - genOpenMPReduction(converter, semaCtx, loopOpClauseList); } else { createWsloop(converter, semaCtx, eval, ompDirective, loopOpClauseList, endClauseList, currentLocation); From 61717c1aa1f08eb57839a21fb2d9004739022e0d Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Tue, 16 Apr 2024 10:33:54 +0100 Subject: [PATCH 063/300] [Verifier] Reject va_start in non-variadic function (#88809) A va_start intrinsic lowers to something derived from the variadic parameter to the function. If there is no such parameter, it can't lower meaningfully. Clang sema rejects the same with `error: 'va_start' used in function with fixed args`. Moves the existing lint warning into a verifier error. Updates the one lit test that had a va_start in a non-variadic function. 
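
For reference, a minimal module exercising the new check (this mirrors the
lit test added below; only the function and parameter names are
illustrative):

  declare void @llvm.va_start(ptr)

  ; Now a verifier error rather than a lint warning: the function is not
  ; variadic, so va_start has no variadic parameter to lower from.
  define void @not_vararg(ptr %p) nounwind {
    call void @llvm.va_start(ptr %p)
    ret void
  }

Declaring the function variadic instead, e.g. `define void @f(ptr %p, ...)`,
keeps the call valid.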
--- llvm/lib/Analysis/Lint.cpp | 5 +---- llvm/lib/IR/Verifier.cpp | 5 +++++ llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll | 2 +- llvm/test/Other/lint.ll | 7 ------- llvm/test/Verifier/variadic.ll | 8 ++++++++ 5 files changed, 15 insertions(+), 12 deletions(-) create mode 100644 llvm/test/Verifier/variadic.ll diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 0694c2995dfcce..1ab856ac8830a9 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -350,10 +350,7 @@ void Lint::visitCallBase(CallBase &I) { } case Intrinsic::vastart: - Check(I.getParent()->getParent()->isVarArg(), - "Undefined behavior: va_start called in a non-varargs function", - &I); - + // vastart in non-varargs function is rejected by the verifier visitMemoryReference(I, MemoryLocation::getForArgument(&I, 0, TLI), std::nullopt, nullptr, MemRef::Read | MemRef::Write); break; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 516d4a0515569b..4cd61e6e531bff 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5798,6 +5798,11 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { break; } + case Intrinsic::vastart: { + Check(Call.getFunction()->isVarArg(), + "va_start called in a non-varargs function"); + break; + } case Intrinsic::vector_reduce_and: case Intrinsic::vector_reduce_or: case Intrinsic::vector_reduce_xor: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll b/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll index bd576d0f70e9c1..8c6e01d934c2d8 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll @@ -3,7 +3,7 @@ declare void @llvm.va_start(ptr) -define void @test_va_start(ptr %list) { +define void @test_va_start(ptr %list, ...) { ; CHECK-LABEL: name: test_va_start ; CHECK: [[LIST:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK-IOS: G_VASTART [[LIST]](p0) :: (store (s64) into %ir.list, align 1) diff --git a/llvm/test/Other/lint.ll b/llvm/test/Other/lint.ll index 6b31b31a78c98a..6fd2d40cd2f298 100644 --- a/llvm/test/Other/lint.ll +++ b/llvm/test/Other/lint.ll @@ -124,13 +124,6 @@ define void @0() nounwind { ret void } -; CHECK: va_start called in a non-varargs function -declare void @llvm.va_start(ptr) -define void @not_vararg(ptr %p) nounwind { - call void @llvm.va_start(ptr %p) - ret void -} - ; CHECK: Undefined behavior: Branch to non-blockaddress define void @use_indbr() { indirectbr ptr @foo, [label %block] diff --git a/llvm/test/Verifier/variadic.ll b/llvm/test/Verifier/variadic.ll new file mode 100644 index 00000000000000..55e4a4da0a9203 --- /dev/null +++ b/llvm/test/Verifier/variadic.ll @@ -0,0 +1,8 @@ +; RUN: not opt -S -passes=verify 2>&1 < %s | FileCheck %s + +; CHECK: va_start called in a non-varargs function +declare void @llvm.va_start(ptr) +define void @not_vararg(ptr %p) nounwind { + call void @llvm.va_start(ptr %p) + ret void +} From 1ca6b4475c02e5d022ec6b35dbb65d0f11409a88 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Tue, 16 Apr 2024 12:39:57 +0300 Subject: [PATCH 064/300] [mlir][scf] `scf.while` uplifting: optimize op matching (#88813) Instead of iterating over potential induction var uses looking for suitable `arith.addi`, try to trace it back from yield argument. 
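
As a sketch of the loop shape being matched (the SSA names and bounds here
are illustrative, not taken from the pass or its tests), the induction
update is now found directly as the defining `arith.addi` of the value
yielded at the induction variable's position:

  %res = scf.while (%iv = %lb) : (index) -> index {
    %cond = arith.cmpi slt, %iv, %ub : index
    scf.condition(%cond) %iv : index
  } do {
  ^bb0(%iv2: index):
    // Matched by walking back from the scf.yield operand; %step must be
    // defined outside the loop to be accepted as the step value.
    %next = arith.addi %iv2, %step : index
    scf.yield %next : index
  }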
---
 .../SCF/Transforms/UpliftWhileToFor.cpp | 36 ++++++++-----------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp
index fea2f659535bb4..7b4024b6861a72 100644
--- a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp
@@ -101,38 +101,30 @@ FailureOr<scf::ForOp> mlir::scf::upliftWhileToForLoop(RewriterBase &rewriter,
   Block *afterBody = loop.getAfterBody();
   scf::YieldOp afterTerm = loop.getYieldOp();

-  auto argNumber = inductionVar.getArgNumber();
-  auto afterTermIndArg = afterTerm.getResults()[argNumber];
+  unsigned argNumber = inductionVar.getArgNumber();
+  Value afterTermIndArg = afterTerm.getResults()[argNumber];

-  auto inductionVarAfter = afterBody->getArgument(argNumber);
-
-  Value step;
+  Value inductionVarAfter = afterBody->getArgument(argNumber);

   // Find suitable `addi` op inside `after` block, one of the args must be an
   // Induction var passed from `before` block and second arg must be defined
   // outside of the loop and will be considered step value.
   // TODO: Add `subi` support?
-  for (auto &use : inductionVarAfter.getUses()) {
-    auto owner = dyn_cast<arith::AddIOp>(use.getOwner());
-    if (!owner)
-      continue;
-
-    auto other =
-        (inductionVarAfter == owner.getLhs() ? owner.getRhs() : owner.getLhs());
-    if (!dom.properlyDominates(other, loop))
-      continue;
-
-    if (afterTermIndArg != owner.getResult())
-      continue;
+  auto addOp = afterTermIndArg.getDefiningOp<arith::AddIOp>();
+  if (!addOp)
+    return rewriter.notifyMatchFailure(loop, "Didn't find a suitable 'addi' op");

-    step = other;
-    break;
+  Value step;
+  if (addOp.getLhs() == inductionVarAfter) {
+    step = addOp.getRhs();
+  } else if (addOp.getRhs() == inductionVarAfter) {
+    step = addOp.getLhs();
   }

-  if (!step)
-    return rewriter.notifyMatchFailure(loop, "Didn't found suitable 'addi' op");
+  if (!step || !dom.properlyDominates(step, loop))
+    return rewriter.notifyMatchFailure(loop, "Invalid 'addi' form");

-  auto lb = loop.getInits()[argNumber];
+  Value lb = loop.getInits()[argNumber];
   assert(lb.getType().isIntOrIndex());
   assert(lb.getType() == ub.getType());

From aae08f4f8ef7bc9c35d263ed974679130c49f5fc Mon Sep 17 00:00:00 2001
From: Sergio Afonso
Date: Tue, 16 Apr 2024 10:40:46 +0100
Subject: [PATCH 065/300] [MLIR][OpenMP] Make omp.taskloop into a loop wrapper
 (#87253)

This patch updates the definition of `omp.taskloop` to enforce the
restrictions of a wrapper operation.
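
In the wrapper form, the loop bounds move off `omp.taskloop` itself and into
a nested `omp.loop_nest` (adapted from the updated op description below; the
bound and step values are placeholders):

  omp.taskloop {
    omp.loop_nest (%i) : i32 = (%lb) to (%ub) step (%step) {
      // ... loop body ...
      omp.yield
    }
  }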
--- .../Dialect/OpenMP/OpenMPClauseOperands.h | 9 +- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 39 ++- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 15 +- mlir/test/Dialect/OpenMP/invalid.mlir | 92 +++++--- mlir/test/Dialect/OpenMP/ops.mlir | 222 ++++++++++-------- 5 files changed, 218 insertions(+), 159 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h index 304a9740d91ed3..27a766aceb3160 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h @@ -284,11 +284,10 @@ using TaskgroupClauseOps = detail::Clauses; using TaskloopClauseOps = - detail::Clauses; + detail::Clauses; using TaskwaitClauseOps = detail::Clauses; diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 7e677c9839f602..82be7ad31a158f 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1030,10 +1030,10 @@ def TaskOp : OpenMP_Op<"task", [AttrSizedOperandSegments, } def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments, - AutomaticAllocationScope, RecursiveMemoryEffects, - AllTypesMatch<["lowerBound", "upperBound", "step"]>, + AutomaticAllocationScope, DeclareOpInterfaceMethods, - ReductionClauseInterface]> { + RecursiveMemoryEffects, ReductionClauseInterface, + SingleBlockImplicitTerminator<"TerminatorOp">]> { let summary = "taskloop construct"; let description = [{ The taskloop construct specifies that the iterations of one or more @@ -1041,21 +1041,19 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments, iterations are distributed across tasks generated by the construct and scheduled to be executed. - The `lowerBound` and `upperBound` specify a half-open range: the range - includes the lower bound but does not include the upper bound. If the - `inclusive` attribute is specified then the upper bound is also included. - The `step` specifies the loop step. - - The body region can contain any number of blocks. + The body region can contain a single block which must contain a single + operation and a terminator. The operation must be another compatible loop + wrapper or an `omp.loop_nest`. ``` - omp.taskloop - for (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { - %a = load %arrA[%i1, %i2] : memref - %b = load %arrB[%i1, %i2] : memref - %sum = arith.addf %a, %b : f32 - store %sum, %arrC[%i1, %i2] : memref - omp.terminator + omp.taskloop { + omp.loop_nest (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { + %a = load %arrA[%i1, %i2] : memref + %b = load %arrB[%i1, %i2] : memref + %sum = arith.addf %a, %b : f32 + store %sum, %arrC[%i1, %i2] : memref + omp.yield + } } ``` @@ -1132,11 +1130,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments, created. 
}]; - let arguments = (ins Variadic:$lowerBound, - Variadic:$upperBound, - Variadic:$step, - UnitAttr:$inclusive, - Optional:$if_expr, + let arguments = (ins Optional:$if_expr, Optional:$final_expr, UnitAttr:$untied, UnitAttr:$mergeable, @@ -1179,8 +1173,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments, |`grain_size` `(` $grain_size `:` type($grain_size) `)` |`num_tasks` `(` $num_tasks `:` type($num_tasks) `)` |`nogroup` $nogroup - ) `for` custom($region, $lowerBound, $upperBound, $step, - type($step), $inclusive) attr-dict + ) $region attr-dict }]; let extraClassDeclaration = [{ diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 365b3c4246bfd9..e500d0fca741fb 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1829,9 +1829,8 @@ void TaskloopOp::build(OpBuilder &builder, OperationState &state, MLIRContext *ctx = builder.getContext(); // TODO Store clauses in op: reductionByRefAttr, privateVars, privatizers. TaskloopOp::build( - builder, state, clauses.loopLBVar, clauses.loopUBVar, clauses.loopStepVar, - clauses.loopInclusiveAttr, clauses.ifVar, clauses.finalVar, - clauses.untiedAttr, clauses.mergeableAttr, clauses.inReductionVars, + builder, state, clauses.ifVar, clauses.finalVar, clauses.untiedAttr, + clauses.mergeableAttr, clauses.inReductionVars, makeArrayAttr(ctx, clauses.inReductionDeclSymbols), clauses.reductionVars, makeArrayAttr(ctx, clauses.reductionDeclSymbols), clauses.priorityVar, clauses.allocateVars, clauses.allocatorVars, clauses.grainsizeVar, @@ -1870,6 +1869,16 @@ LogicalResult TaskloopOp::verify() { "the grainsize clause and num_tasks clause are mutually exclusive and " "may not appear on the same taskloop directive"); } + + if (!isWrapper()) + return emitOpError() << "must be a loop wrapper"; + + if (LoopWrapperInterface nested = getNestedWrapper()) { + // Check for the allowed leaf constructs that may appear in a composite + // construct directly after TASKLOOP. 
+ if (!isa(nested)) + return emitError() << "only supported nested wrapper is 'omp.simdloop'"; + } return success(); } diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index f8739887214bfd..7f86a7f5b3182e 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -1580,10 +1580,11 @@ func.func @omp_cancellationpoint2() { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testmemref = "test.memref"() : () -> (memref) // expected-error @below {{expected equal sizes for allocate and allocator variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testmemref) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, memref) -> () + "omp.taskloop"(%testmemref) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {operandSegmentSizes = array} : (memref) -> () return } @@ -1593,10 +1594,11 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{expected as many reduction symbol references as reduction variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32, %testf32_2) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {operandSegmentSizes = array, reductions = [@add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr, !llvm.ptr) -> () + "omp.taskloop"(%testf32, %testf32_2) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {operandSegmentSizes = array, reductions = [@add_f32]} : (!llvm.ptr, !llvm.ptr) -> () return } @@ -1604,12 +1606,12 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) - %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{expected as many reduction symbol references as reduction variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {operandSegmentSizes = array, reductions = [@add_f32, @add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr) -> () + "omp.taskloop"(%testf32) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {operandSegmentSizes = array, reductions = [@add_f32, @add_f32]} : (!llvm.ptr) -> () return } @@ -1619,10 +1621,11 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{expected as many reduction symbol references as reduction variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32, %testf32_2) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {in_reductions = [@add_f32], operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, !llvm.ptr, !llvm.ptr) -> () + "omp.taskloop"(%testf32, %testf32_2) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {in_reductions = [@add_f32], operandSegmentSizes = array} : (!llvm.ptr, !llvm.ptr) -> () return } @@ -1630,12 +1633,12 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) - %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // 
expected-error @below {{expected as many reduction symbol references as reduction variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32_2) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {in_reductions = [@add_f32, @add_f32], operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, !llvm.ptr) -> () + "omp.taskloop"(%testf32) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {in_reductions = [@add_f32, @add_f32], operandSegmentSizes = array} : (!llvm.ptr) -> () return } @@ -1657,9 +1660,10 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{if a reduction clause is present on the taskloop directive, the nogroup clause must not be specified}} - omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) nogroup - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - omp.terminator + omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) nogroup { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } } return } @@ -1681,9 +1685,10 @@ combiner { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{the same list item cannot appear in both a reduction and an in_reduction clause}} - omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr) in_reduction(@add_f32 -> %testf32 : !llvm.ptr) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - omp.terminator + omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr) in_reduction(@add_f32 -> %testf32 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } } return } @@ -1693,8 +1698,20 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testi64 = "test.i64"() : () -> (i64) // expected-error @below {{the grainsize clause and num_tasks clause are mutually exclusive and may not appear on the same taskloop directive}} - omp.taskloop grain_size(%testi64: i64) num_tasks(%testi64: i64) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.taskloop grain_size(%testi64: i64) num_tasks(%testi64: i64) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + } + return +} + +// ----- + +func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { + // expected-error @below {{op must be a loop wrapper}} + omp.taskloop { + %0 = arith.constant 0 : i32 omp.terminator } return @@ -1702,6 +1719,21 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { // ----- +func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { + // expected-error @below {{only supported nested wrapper is 'omp.simdloop'}} + omp.taskloop { + omp.distribute { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + omp.terminator + } + } + return +} + +// ----- + func.func @omp_threadprivate() { %1 = llvm.mlir.addressof @_QFsubEx : !llvm.ptr // expected-error @below {{op failed to verify that all of {sym_addr, tls_addr} have same type}} diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 018b82e995d7d3..802e1795b3fffb 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -171,6 +171,23 @@ func.func 
@omp_loop_nest(%lb : index, %ub : index, %step : index) -> () { omp.yield } + // TODO Remove induction variables from omp.wsloop. + omp.wsloop for (%iv) : index = (%lb) to (%ub) step (%step) { + // CHECK: omp.loop_nest + // CHECK-SAME: (%{{.*}}) : index = + // CHECK-SAME: (%{{.*}}) to (%{{.*}}) step (%{{.*}}) + "omp.loop_nest" (%lb, %ub, %step) ({ + ^bb0(%iv2: index): + // CHECK: test.op1 + "test.op1"(%lb) : (index) -> () + // CHECK: test.op2 + "test.op2"() : () -> () + // CHECK: omp.yield + omp.yield + }) : (index, index, index) -> () + omp.yield + } + return } @@ -209,6 +226,22 @@ func.func @omp_loop_nest_pretty(%lb : index, %ub : index, %step : index) -> () { omp.yield } + // TODO Remove induction variables from omp.wsloop. + omp.wsloop for (%iv) : index = (%lb) to (%ub) step (%step) { + // CHECK: omp.loop_nest + // CHECK-SAME: (%{{.*}}) : index = + // CHECK-SAME: (%{{.*}}) to (%{{.*}}) step (%{{.*}}) + omp.loop_nest (%iv2) : index = (%lb) to (%ub) step (%step) { + // CHECK: test.op1 + "test.op1"(%lb) : (index) -> () + // CHECK: test.op2 + "test.op2"() : () -> () + // CHECK: omp.yield + omp.yield + } + omp.yield + } + return } @@ -2024,135 +2057,128 @@ func.func @omp_taskgroup_clauses() -> () { // CHECK-LABEL: @omp_taskloop func.func @omp_taskloop(%lb: i32, %ub: i32, %step: i32) -> () { - // CHECK: omp.taskloop for (%{{.+}}) : i32 = (%{{.+}}) to (%{{.+}}) step (%{{.+}}) { - omp.taskloop for (%i) : i32 = (%lb) to (%ub) step (%step) { - // CHECK: omp.terminator - omp.terminator - } - - // CHECK: omp.taskloop for (%{{.+}}) : i32 = (%{{.+}}) to (%{{.+}}) step (%{{.+}}) { - omp.taskloop for (%i) : i32 = (%lb) to (%ub) step (%step) { - // CHECK: test.op1 - "test.op1"(%lb) : (i32) -> () - // CHECK: test.op2 - "test.op2"() : () -> () - // CHECK: omp.terminator - omp.terminator - } - - // CHECK: omp.taskloop for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator - } - - // CHECK: omp.taskloop for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) inclusive step (%{{.+}}, %{{.+}}) { - omp.taskloop for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) inclusive step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop { + omp.taskloop { + omp.loop_nest (%i) : i32 = (%lb) to (%ub) step (%step) { + // CHECK: omp.yield + omp.yield + } } %testbool = "test.bool"() : () -> (i1) - // CHECK: omp.taskloop if(%{{[^)]+}}) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop if(%testbool) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop if(%{{[^)]+}}) { + omp.taskloop if(%testbool) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop final(%{{[^)]+}}) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop final(%testbool) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop final(%{{[^)]+}}) { + omp.taskloop final(%testbool) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop untied - // 
CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop untied - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop untied { + omp.taskloop untied { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop mergeable - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop mergeable - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop mergeable { + omp.taskloop mergeable { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } %testf32 = "test.f32"() : () -> (!llvm.ptr) %testf32_2 = "test.f32"() : () -> (!llvm.ptr) - // CHECK: omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr) { + omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr) { + omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr) reduction(@add_f32 -> %{{.+}} : !llvm.ptr) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr) reduction(@add_f32 -> %testf32_2 : !llvm.ptr) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr) reduction(@add_f32 -> %{{.+}} : !llvm.ptr) { + omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr) reduction(@add_f32 -> %testf32_2 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } %testi32 = "test.i32"() : () -> (i32) - // CHECK: omp.taskloop priority(%{{[^:]+}}: i32) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop priority(%testi32: i32) - for (%i, %j) : i32 = 
(%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop priority(%{{[^:]+}}: i32) { + omp.taskloop priority(%testi32: i32) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } %testmemref = "test.memref"() : () -> (memref) - // CHECK: omp.taskloop allocate(%{{.+}} : memref -> %{{.+}} : memref) - omp.taskloop allocate(%testmemref : memref -> %testmemref : memref) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop allocate(%{{.+}} : memref -> %{{.+}} : memref) { + omp.taskloop allocate(%testmemref : memref -> %testmemref : memref) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } %testi64 = "test.i64"() : () -> (i64) - // CHECK: omp.taskloop grain_size(%{{[^:]+}}: i64) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop grain_size(%testi64: i64) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop grain_size(%{{[^:]+}}: i64) { + omp.taskloop grain_size(%testi64: i64) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop num_tasks(%{{[^:]+}}: i64) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop num_tasks(%testi64: i64) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop num_tasks(%{{[^:]+}}: i64) { + omp.taskloop num_tasks(%testi64: i64) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop nogroup - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop nogroup - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop nogroup { + omp.taskloop nogroup { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } + } + + // CHECK: omp.taskloop { + omp.taskloop { + // TODO Remove induction variables from omp.simdloop. + omp.simdloop for (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } + // CHECK: omp.yield + omp.yield + } } // CHECK: return From ca4cf973279a3991248056a73bcb2bac8b37d035 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 15 Apr 2024 12:16:55 +0200 Subject: [PATCH 066/300] [clang][Interp][NFC] Fix Pointer::isZero() for block pointers We don't need to consider the offset here anymore since we now have proper integral pointers. 
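For illustration, a rough standalone sketch of the semantic change (toy
stand-ins for the interpreter's pointer representation, not the real
clang/lib/AST/Interp classes; `Pointee`, `Value` and `Offset` mirror the
fields touched in the diff below, everything else here is hypothetical):

  #include <cassert>
  #include <cstdint>

  struct Block; // opaque stand-in for an interpreter allocation

  struct ToyPointer {
    const Block *Pointee; // meaningful for block pointers
    uint64_t Value;       // meaningful for integral pointers
    uint64_t Offset;
    bool IsBlock;

    // Old check: bailed out on any non-zero Offset before looking at the
    // pointee, so the offset also gated block pointers.
    bool isZeroOld() const {
      if (Offset != 0)
        return false;
      return IsBlock ? Pointee == nullptr : Value == 0;
    }

    // New check: a block pointer is null iff it has no pointee; the
    // Offset only participates in the integral-pointer case.
    bool isZeroNew() const {
      if (IsBlock)
        return Pointee == nullptr;
      return Value == 0 && Offset == 0;
    }
  };

  int main() {
    ToyPointer p{nullptr, 0, 8, true};
    assert(!p.isZeroOld() && p.isZeroNew()); // offset no longer hides nullness
  }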
---
 clang/lib/AST/Interp/Pointer.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index fcd00aac62f93e..b4475577b74625 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -241,13 +241,10 @@ class Pointer {
 
   /// Checks if the pointer is null.
   bool isZero() const {
-    if (Offset != 0)
-      return false;
-
     if (isBlockPointer())
       return asBlockPointer().Pointee == nullptr;
     assert(isIntegralPointer());
-    return asIntPointer().Value == 0;
+    return asIntPointer().Value == 0 && Offset == 0;
   }
 
   /// Checks if the pointer is live.
   bool isLive() const {

From 422bf13f336923da89055f8e70e49e7e9ced2c70 Mon Sep 17 00:00:00 2001
From: choikwa <5455710+choikwa@users.noreply.github.com>
Date: Tue, 16 Apr 2024 06:04:37 -0400
Subject: [PATCH 067/300] =?UTF-8?q?[AMDGPU]=20In=20VectorLegalizer::Expand?=
 =?UTF-8?q?,=20if=20UnrollVectorOp=20returns=20Load,=20=E2=80=A6=20(#88475?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…return only the Load, since the other output is the chain. Added a
testcase that showed a mismatched expected arity when the Load and the
chain were returned as separate items, after
003b58f65bdd5d9c7d0c1b355566c9ef430c0e7d
---
 .../SelectionDAG/LegalizeVectorOps.cpp        | 10 +-
 llvm/test/CodeGen/AMDGPU/build_vector.ll      | 97 +++++++++++++++++++
 2 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 57a3f6a65e002c..7a9cfdf5c3fda9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1159,8 +1159,14 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) {
   }
 
   SDValue Unrolled = DAG.UnrollVectorOp(Node);
-  for (unsigned I = 0, E = Unrolled->getNumValues(); I != E; ++I)
-    Results.push_back(Unrolled.getValue(I));
+  if (Node->getNumValues() == 1) {
+    Results.push_back(Unrolled);
+  } else {
+    assert(Node->getNumValues() == Unrolled->getNumValues() &&
+           "VectorLegalizer Expand returned wrong number of results!");
+    for (unsigned I = 0, E = Unrolled->getNumValues(); I != E; ++I)
+      Results.push_back(Unrolled.getValue(I));
+  }
 }
 
 SDValue VectorLegalizer::ExpandSELECT(SDNode *Node) {
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 37412ac3aa5418..99755133f36d6a 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -3,6 +3,7 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8,GFX678,ALL
 ; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10,GFX1011,ALL
 ; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX1011,ALL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s --check-prefixes=GFX940,ALL
 
 ; ALL-LABEL: {{^}}build_vector2:
 ; R600: MOV
@@ -96,3 +97,99 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
   store <2 x i16> %ins.1, ptr addrspace(1) %out
   ret void
 }
+
+; R600-LABEL: build_v2i32_from_v4i16_shuffle:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 0, @10, KC0[], KC1[]
+; R600-NEXT:    TEX 1 @6
+; R600-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; 
R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3 +; R600-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3 +; R600-NEXT: ALU clause starting at 10: +; R600-NEXT: MOV * T0.X, 0.0, +; R600-NEXT: ALU clause starting at 11: +; R600-NEXT: LSHL * T0.Y, T1.X, literal.x, +; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; R600-NEXT: LSHL T0.X, T0.X, literal.x, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; R600-NEXT: 16(2.242078e-44), 2(2.802597e-45) +; +; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: s_lshl_b32 s0, s3, 16 +; GFX8-NEXT: s_lshl_b32 s1, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_lshl_b32 s3, s3, 16 +; GFX940-NEXT: s_lshl_b32 s2, s2, 16 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_endpgm +define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) { +entry: + %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> + %zextended = zext <2 x i16> %shuf to <2 x i32> + %shifted = shl <2 x i32> %zextended, + store <2 x i32> %shifted, ptr addrspace(1) %out + ret void +} From 4dd5180a2d43b088d7637c30c2654f3c01c46987 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 16 Apr 2024 11:08:25 +0100 Subject: [PATCH 068/300] [Flang][OpenMP][Lower] Split MLIR codegen for clauses and constructs (#86963) This patch performs several cleanups with the main purpose of normalizing the code 
patterns used to trigger codegen for MLIR OpenMP operations and making
the processing of clauses and constructs independent.

The following changes are made:
- Clean up the unused `directive` argument to
  `ClauseProcessor::processMap()`.
- Move general helper functions in OpenMP.cpp to the appropriate section
  of the file.
- Create `genClauses()` functions containing the clause processing code
  specific to the associated OpenMP construct.
- Update `genOp()` functions to call the corresponding `genClauses()`
  function.
- Sort calls to `ClauseProcessor::process()` alphabetically, to avoid
  inadvertently relying on some arbitrary order. Update some tests that
  broke due to the order change.
- Normalize `genOMP()` functions so they all delegate the generation of
  MLIR to `genOp()` functions following the same pattern.
- Only process the `nowait` clause on `TARGET` constructs if not
  compiling for the target device.

A later patch can move the calls to `genClauses()` out of `genOp()`
functions and pass completed clause structures instead, in preparation
for supporting composite constructs. That will make it possible to reuse
clause processing for a given leaf construct when it appears alone or in
a combined or composite construct, while controlling where the
associated code is produced.
---
 flang/lib/Lower/OpenMP/ClauseProcessor.cpp |    4 +-
 flang/lib/Lower/OpenMP/ClauseProcessor.h   |    3 +-
 flang/lib/Lower/OpenMP/OpenMP.cpp          | 2136 +++++++++--------
 flang/test/Lower/OpenMP/FIR/target.f90     |    2 +-
 flang/test/Lower/OpenMP/target.f90         |    2 +-
 .../use-device-ptr-to-use-device-addr.f90  |    4 +-
 6 files changed, 1198 insertions(+), 953 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index ae0d8bd37228df..4c51b61f6bf029 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -832,8 +832,8 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
 }
 
 bool ClauseProcessor::processMap(
-    mlir::Location currentLocation, const llvm::omp::Directive &directive,
-    Fortran::lower::StatementContext &stmtCtx, mlir::omp::MapClauseOps &result,
+    mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx,
+    mlir::omp::MapClauseOps &result,
     llvm::SmallVectorImpl *mapSyms,
     llvm::SmallVectorImpl *mapSymLocs,
     llvm::SmallVectorImpl *mapSymTypes) const {
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index aa2c14b61e7565..3f9701310ebaeb 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -114,8 +114,7 @@ class ClauseProcessor {
   // They may be used later on to create the block_arguments for some of the
   // target directives that require it.
  bool processMap(
-      mlir::Location currentLocation, const llvm::omp::Directive &directive,
-      Fortran::lower::StatementContext &stmtCtx,
+      mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx,
       mlir::omp::MapClauseOps &result,
       llvm::SmallVectorImpl *mapSyms = nullptr,
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 352ca66e8735b6..9b997522366621 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -222,6 +222,276 @@ createAndSetPrivatizedLoopVar(Fortran::lower::AbstractConverter &converter,
   return storeOp;
 }
 
+// This helper function implements the functionality of "promoting"
+// non-CPTR arguments of use_device_ptr to use_device_addr
+// arguments (automagic conversion of use_device_ptr ->
+// use_device_addr in these cases). The way we do so currently is
+// through the shuffling of operands from the devicePtrOperands to
+// deviceAddrOperands where necessary and re-organizing the types,
+// locations and symbols to maintain the correct ordering of ptr/addr
+// input -> BlockArg.
+//
+// This effectively implements some deprecated OpenMP functionality
+// that some legacy applications unfortunately depend on
+// (deprecated in specification version 5.2):
+//
+// "If a list item in a use_device_ptr clause is not of type C_PTR,
+// the behavior is as if the list item appeared in a use_device_addr
+// clause. Support for such list items in a use_device_ptr clause
+// is deprecated."
+static void promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr(
+    mlir::omp::UseDeviceClauseOps &clauseOps,
+    llvm::SmallVectorImpl &useDeviceTypes,
+    llvm::SmallVectorImpl &useDeviceLocs,
+    llvm::SmallVectorImpl
+        &useDeviceSymbols) {
+  auto moveElementToBack = [](size_t idx, auto &vector) {
+    auto *iter = std::next(vector.begin(), idx);
+    vector.push_back(*iter);
+    vector.erase(iter);
+  };
+
+  // Iterate over our use_device_ptr list and shift all non-cptr arguments into
+  // use_device_addr.
+  for (auto *it = clauseOps.useDevicePtrVars.begin();
+       it != clauseOps.useDevicePtrVars.end();) {
+    if (!fir::isa_builtin_cptr_type(fir::unwrapRefType(it->getType()))) {
+      clauseOps.useDeviceAddrVars.push_back(*it);
+      // We have to shuffle the symbols around as well, to maintain
+      // the correct Input -> BlockArg for use_device_ptr/use_device_addr.
+      // NOTE: However, as maps do not seem to be included currently
+      // this isn't as pertinent, but we must try to maintain for
+      // future alterations. I believe the reason they are not currently
+      // is that the BlockArg assign/lowering needs to be extended
+      // to a greater set of types.
+      auto idx = std::distance(clauseOps.useDevicePtrVars.begin(), it);
+      moveElementToBack(idx, useDeviceTypes);
+      moveElementToBack(idx, useDeviceLocs);
+      moveElementToBack(idx, useDeviceSymbols);
+      it = clauseOps.useDevicePtrVars.erase(it);
+      continue;
+    }
+    ++it;
+  }
+}
+
+/// Extract the list of function and variable symbols affected by the given
+/// 'declare target' directive and return the intended device type for them.
+static void getDeclareTargetInfo( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, + mlir::omp::DeclareTargetClauseOps &clauseOps, + llvm::SmallVectorImpl &symbolAndClause) { + const auto &spec = std::get( + declareTargetConstruct.t); + if (const auto *objectList{ + Fortran::parser::Unwrap(spec.u)}) { + ObjectList objects{makeObjects(*objectList, semaCtx)}; + // Case: declare target(func, var1, var2) + gatherFuncAndVarSyms(objects, mlir::omp::DeclareTargetCaptureClause::to, + symbolAndClause); + } else if (const auto *clauseList{ + Fortran::parser::Unwrap( + spec.u)}) { + if (clauseList->v.empty()) { + // Case: declare target, implicit capture of function + symbolAndClause.emplace_back( + mlir::omp::DeclareTargetCaptureClause::to, + eval.getOwningProcedure()->getSubprogramSymbol()); + } + + ClauseProcessor cp(converter, semaCtx, *clauseList); + cp.processDeviceType(clauseOps); + cp.processEnter(symbolAndClause); + cp.processLink(symbolAndClause); + cp.processTo(symbolAndClause); + + cp.processTODO(converter.getCurrentLocation(), + llvm::omp::Directive::OMPD_declare_target); + } +} + +static void collectDeferredDeclareTargets( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, + llvm::SmallVectorImpl + &deferredDeclareTarget) { + mlir::omp::DeclareTargetClauseOps clauseOps; + llvm::SmallVector symbolAndClause; + getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, + clauseOps, symbolAndClause); + // Return the device type only if at least one of the targets for the + // directive is a function or subroutine + mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); + + for (const DeclareTargetCapturePair &symClause : symbolAndClause) { + mlir::Operation *op = mod.lookupSymbol(converter.mangleName( + std::get(symClause))); + + if (!op) { + deferredDeclareTarget.push_back({std::get<0>(symClause), + clauseOps.deviceType, + std::get<1>(symClause)}); + } + } +} + +static std::optional +getDeclareTargetFunctionDevice( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPDeclareTargetConstruct + &declareTargetConstruct) { + mlir::omp::DeclareTargetClauseOps clauseOps; + llvm::SmallVector symbolAndClause; + getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, + clauseOps, symbolAndClause); + + // Return the device type only if at least one of the targets for the + // directive is a function or subroutine + mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); + for (const DeclareTargetCapturePair &symClause : symbolAndClause) { + mlir::Operation *op = mod.lookupSymbol(converter.mangleName( + std::get(symClause))); + + if (mlir::isa_and_nonnull(op)) + return clauseOps.deviceType; + } + + return std::nullopt; +} + +static llvm::SmallVector +genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter, + mlir::Location &loc, + llvm::ArrayRef args) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + auto ®ion = op->getRegion(0); + + std::size_t loopVarTypeSize = 0; + for (const Fortran::semantics::Symbol *arg : args) + loopVarTypeSize = std::max(loopVarTypeSize, 
arg->GetUltimate().size()); + mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); + llvm::SmallVector tiv(args.size(), loopVarType); + llvm::SmallVector locs(args.size(), loc); + firOpBuilder.createBlock(®ion, {}, tiv, locs); + // The argument is not currently in memory, so make a temporary for the + // argument, and store it there, then bind that location to the argument. + mlir::Operation *storeOp = nullptr; + for (auto [argIndex, argSymbol] : llvm::enumerate(args)) { + mlir::Value indexVal = fir::getBase(region.front().getArgument(argIndex)); + storeOp = + createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); + } + firOpBuilder.setInsertionPointAfter(storeOp); + return llvm::SmallVector(args); +} + +static void genReductionVars( + mlir::Operation *op, Fortran::lower::AbstractConverter &converter, + mlir::Location &loc, + llvm::ArrayRef reductionArgs, + llvm::ArrayRef reductionTypes) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + llvm::SmallVector blockArgLocs(reductionArgs.size(), loc); + + mlir::Block *entryBlock = firOpBuilder.createBlock( + &op->getRegion(0), {}, reductionTypes, blockArgLocs); + + // Bind the reduction arguments to their block arguments. + for (auto [arg, prv] : + llvm::zip_equal(reductionArgs, entryBlock->getArguments())) { + converter.bindSymbol(*arg, prv); + } +} + +static llvm::SmallVector +genLoopAndReductionVars( + mlir::Operation *op, Fortran::lower::AbstractConverter &converter, + mlir::Location &loc, + llvm::ArrayRef loopArgs, + llvm::ArrayRef reductionArgs, + llvm::ArrayRef reductionTypes) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + llvm::SmallVector blockArgTypes; + llvm::SmallVector blockArgLocs; + blockArgTypes.reserve(loopArgs.size() + reductionArgs.size()); + blockArgLocs.reserve(blockArgTypes.size()); + mlir::Block *entryBlock; + + if (loopArgs.size()) { + std::size_t loopVarTypeSize = 0; + for (const Fortran::semantics::Symbol *arg : loopArgs) + loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size()); + mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); + std::fill_n(std::back_inserter(blockArgTypes), loopArgs.size(), + loopVarType); + std::fill_n(std::back_inserter(blockArgLocs), loopArgs.size(), loc); + } + if (reductionArgs.size()) { + llvm::copy(reductionTypes, std::back_inserter(blockArgTypes)); + std::fill_n(std::back_inserter(blockArgLocs), reductionArgs.size(), loc); + } + entryBlock = firOpBuilder.createBlock(&op->getRegion(0), {}, blockArgTypes, + blockArgLocs); + // The argument is not currently in memory, so make a temporary for the + // argument, and store it there, then bind that location to the argument. 
+ if (loopArgs.size()) { + mlir::Operation *storeOp = nullptr; + for (auto [argIndex, argSymbol] : llvm::enumerate(loopArgs)) { + mlir::Value indexVal = + fir::getBase(op->getRegion(0).front().getArgument(argIndex)); + storeOp = + createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); + } + firOpBuilder.setInsertionPointAfter(storeOp); + } + // Bind the reduction arguments to their block arguments + for (auto [arg, prv] : llvm::zip_equal( + reductionArgs, + llvm::drop_begin(entryBlock->getArguments(), loopArgs.size()))) { + converter.bindSymbol(*arg, prv); + } + + return llvm::SmallVector(loopArgs); +} + +static void +markDeclareTarget(mlir::Operation *op, + Fortran::lower::AbstractConverter &converter, + mlir::omp::DeclareTargetCaptureClause captureClause, + mlir::omp::DeclareTargetDeviceType deviceType) { + // TODO: Add support for program local variables with declare target applied + auto declareTargetOp = llvm::dyn_cast(op); + if (!declareTargetOp) + fir::emitFatalError( + converter.getCurrentLocation(), + "Attempt to apply declare target on unsupported operation"); + + // The function or global already has a declare target applied to it, very + // likely through implicit capture (usage in another declare target + // function/subroutine). It should be marked as any if it has been assigned + // both host and nohost, else we skip, as there is no change + if (declareTargetOp.isDeclareTarget()) { + if (declareTargetOp.getDeclareTargetDeviceType() != deviceType) + declareTargetOp.setDeclareTarget(mlir::omp::DeclareTargetDeviceType::any, + captureClause); + return; + } + + declareTargetOp.setDeclareTarget(deviceType, captureClause); +} + +//===----------------------------------------------------------------------===// +// Op body generation helper structures and functions +//===----------------------------------------------------------------------===// + struct OpWithBodyGenInfo { /// A type for a code-gen callback function. This takes as argument the op for /// which the code is being generated and returns the arguments of the op's @@ -493,548 +763,737 @@ static void genBodyOfTargetDataOp( genNestedEvaluations(converter, eval); } -template -static OpTy genOpWithBody(OpWithBodyGenInfo &info, Args &&...args) { - auto op = info.converter.getFirOpBuilder().create( - info.loc, std::forward(args)...); - createBodyOfOp(op, info); - return op; -} - -static mlir::omp::MasterOp -genMasterOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation) { - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested)); -} - -static mlir::omp::OrderedRegionOp -genOrderedRegionOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - mlir::omp::OrderedRegionClauseOps clauseOps; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processTODO(currentLocation, - llvm::omp::Directive::OMPD_ordered); - - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested), - clauseOps); -} +// This functions creates a block for the body of the targetOp's region. It adds +// all the symbols present in mapSymbols as block arguments to this block. 
+static void +genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::omp::TargetOp &targetOp, + llvm::ArrayRef mapSyms, + llvm::ArrayRef mapSymLocs, + llvm::ArrayRef mapSymTypes, + const mlir::Location ¤tLocation) { + assert(mapSymTypes.size() == mapSymLocs.size()); -static mlir::omp::ParallelOp -genParallelOp(Fortran::lower::AbstractConverter &converter, - Fortran::lower::SymMap &symTable, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList, - bool outerCombined = false) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - Fortran::lower::StatementContext stmtCtx; - mlir::omp::ParallelClauseOps clauseOps; - llvm::SmallVector privateSyms; - llvm::SmallVector reductionTypes; - llvm::SmallVector reductionSyms; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_parallel, clauseOps); - cp.processNumThreads(stmtCtx, clauseOps); - cp.processProcBind(clauseOps); - cp.processDefault(); - cp.processAllocate(clauseOps); - - if (!outerCombined) - cp.processReduction(currentLocation, clauseOps, &reductionTypes, - &reductionSyms); + mlir::Region ®ion = targetOp.getRegion(); - if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars)) - clauseOps.reductionByRefAttr = firOpBuilder.getUnitAttr(); + auto *regionBlock = + firOpBuilder.createBlock(®ion, {}, mapSymTypes, mapSymLocs); - auto reductionCallback = [&](mlir::Operation *op) { - llvm::SmallVector locs(clauseOps.reductionVars.size(), - currentLocation); - auto *block = - firOpBuilder.createBlock(&op->getRegion(0), {}, reductionTypes, locs); - for (auto [arg, prv] : - llvm::zip_equal(reductionSyms, block->getArguments())) { - converter.bindSymbol(*arg, prv); + // Clones the `bounds` placing them inside the target region and returns them. + auto cloneBound = [&](mlir::Value bound) { + if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { + mlir::Operation *clonedOp = bound.getDefiningOp()->clone(); + regionBlock->push_back(clonedOp); + return clonedOp->getResult(0); } - return reductionSyms; + TODO(converter.getCurrentLocation(), + "target map clause operand unsupported bound type"); }; - OpWithBodyGenInfo genInfo = - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setOuterCombined(outerCombined) - .setClauses(&clauseList) - .setReductions(&reductionSyms, &reductionTypes) - .setGenRegionEntryCb(reductionCallback); + auto cloneBounds = [cloneBound](llvm::ArrayRef bounds) { + llvm::SmallVector clonedBounds; + for (mlir::Value bound : bounds) + clonedBounds.emplace_back(cloneBound(bound)); + return clonedBounds; + }; - if (!enableDelayedPrivatization) - return genOpWithBody(genInfo, clauseOps); + // Bind the symbols to their corresponding block arguments. + for (auto [argIndex, argSymbol] : llvm::enumerate(mapSyms)) { + const mlir::BlockArgument &arg = region.getArgument(argIndex); + // Avoid capture of a reference to a structured binding. + const Fortran::semantics::Symbol *sym = argSymbol; + // Structure component symbols don't have bindings. 
+ if (sym->owner().IsDerivedType()) + continue; + fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); + extVal.match( + [&](const fir::BoxValue &v) { + converter.bindSymbol(*sym, + fir::BoxValue(arg, cloneBounds(v.getLBounds()), + v.getExplicitParameters(), + v.getExplicitExtents())); + }, + [&](const fir::MutableBoxValue &v) { + converter.bindSymbol( + *sym, fir::MutableBoxValue(arg, cloneBounds(v.getLBounds()), + v.getMutableProperties())); + }, + [&](const fir::ArrayBoxValue &v) { + converter.bindSymbol( + *sym, fir::ArrayBoxValue(arg, cloneBounds(v.getExtents()), + cloneBounds(v.getLBounds()), + v.getSourceBox())); + }, + [&](const fir::CharArrayBoxValue &v) { + converter.bindSymbol( + *sym, fir::CharArrayBoxValue(arg, cloneBound(v.getLen()), + cloneBounds(v.getExtents()), + cloneBounds(v.getLBounds()))); + }, + [&](const fir::CharBoxValue &v) { + converter.bindSymbol(*sym, + fir::CharBoxValue(arg, cloneBound(v.getLen()))); + }, + [&](const fir::UnboxedValue &v) { converter.bindSymbol(*sym, arg); }, + [&](const auto &) { + TODO(converter.getCurrentLocation(), + "target map clause operand unsupported type"); + }); + } - bool privatize = !outerCombined; - DataSharingProcessor dsp(converter, semaCtx, clauseList, eval, - /*useDelayedPrivatization=*/true, &symTable); + // Check if cloning the bounds introduced any dependency on the outer region. + // If so, then either clone them as well if they are MemoryEffectFree, or else + // copy them to a new temporary and add them to the map and block_argument + // lists and replace their uses with the new temporary. + llvm::SetVector valuesDefinedAbove; + mlir::getUsedValuesDefinedAbove(region, valuesDefinedAbove); + while (!valuesDefinedAbove.empty()) { + for (mlir::Value val : valuesDefinedAbove) { + mlir::Operation *valOp = val.getDefiningOp(); + if (mlir::isMemoryEffectFree(valOp)) { + mlir::Operation *clonedOp = valOp->clone(); + regionBlock->push_front(clonedOp); + val.replaceUsesWithIf( + clonedOp->getResult(0), [regionBlock](mlir::OpOperand &use) { + return use.getOwner()->getBlock() == regionBlock; + }); + } else { + auto savedIP = firOpBuilder.getInsertionPoint(); + firOpBuilder.setInsertionPointAfter(valOp); + auto copyVal = + firOpBuilder.createTemporary(val.getLoc(), val.getType()); + firOpBuilder.createStoreWithConvert(copyVal.getLoc(), val, copyVal); - if (privatize) - dsp.processStep1(&clauseOps, &privateSyms); + llvm::SmallVector bounds; + std::stringstream name; + firOpBuilder.setInsertionPoint(targetOp); + mlir::Value mapOp = createMapInfoOp( + firOpBuilder, copyVal.getLoc(), copyVal, mlir::Value{}, name.str(), + bounds, llvm::SmallVector{}, + static_cast< + std::underlying_type_t>( + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT), + mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType()); + targetOp.getMapOperandsMutable().append(mapOp); + mlir::Value clonedValArg = + region.addArgument(copyVal.getType(), copyVal.getLoc()); + firOpBuilder.setInsertionPointToStart(regionBlock); + auto loadOp = firOpBuilder.create(clonedValArg.getLoc(), + clonedValArg); + val.replaceUsesWithIf( + loadOp->getResult(0), [regionBlock](mlir::OpOperand &use) { + return use.getOwner()->getBlock() == regionBlock; + }); + firOpBuilder.setInsertionPoint(regionBlock, savedIP); + } + } + valuesDefinedAbove.clear(); + mlir::getUsedValuesDefinedAbove(region, valuesDefinedAbove); + } - auto genRegionEntryCB = [&](mlir::Operation *op) { - auto parallelOp = llvm::cast(op); + // Insert dummy instruction to remember the insertion 
position. The + // marker will be deleted since there are not uses. + // In the HLFIR flow there are hlfir.declares inserted above while + // setting block arguments. + mlir::Value undefMarker = firOpBuilder.create( + targetOp.getOperation()->getLoc(), firOpBuilder.getIndexType()); - llvm::SmallVector reductionLocs( - clauseOps.reductionVars.size(), currentLocation); + // Create blocks for unstructured regions. This has to be done since + // blocks are initially allocated with the function as the parent region. + if (eval.lowerAsUnstructured()) { + Fortran::lower::createEmptyRegionBlocks( + firOpBuilder, eval.getNestedEvaluations()); + } - mlir::OperandRange privateVars = parallelOp.getPrivateVars(); - mlir::Region ®ion = parallelOp.getRegion(); + firOpBuilder.create(currentLocation); - llvm::SmallVector privateVarTypes = reductionTypes; - privateVarTypes.reserve(privateVarTypes.size() + privateVars.size()); - llvm::transform(privateVars, std::back_inserter(privateVarTypes), - [](mlir::Value v) { return v.getType(); }); + // Create the insertion point after the marker. + firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp()); + if (genNested) + genNestedEvaluations(converter, eval); +} - llvm::SmallVector privateVarLocs = reductionLocs; - privateVarLocs.reserve(privateVarLocs.size() + privateVars.size()); - llvm::transform(privateVars, std::back_inserter(privateVarLocs), - [](mlir::Value v) { return v.getLoc(); }); +template +static OpTy genOpWithBody(OpWithBodyGenInfo &info, Args &&...args) { + auto op = info.converter.getFirOpBuilder().create( + info.loc, std::forward(args)...); + createBodyOfOp(op, info); + return op; +} - firOpBuilder.createBlock(®ion, /*insertPt=*/{}, privateVarTypes, - privateVarLocs); +//===----------------------------------------------------------------------===// +// Code generation functions for clauses +//===----------------------------------------------------------------------===// - llvm::SmallVector allSymbols = - reductionSyms; - allSymbols.append(privateSyms); - for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) { - converter.bindSymbol(*arg, prv); - } +static void genCriticalDeclareClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + mlir::omp::CriticalClauseOps &clauseOps, llvm::StringRef name) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processHint(clauseOps); + clauseOps.nameAttr = + mlir::StringAttr::get(converter.getFirOpBuilder().getContext(), name); +} - return allSymbols; - }; +static void genFlushClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const std::optional &objects, + const std::optional> + &clauses, + mlir::Location loc, llvm::SmallVectorImpl &operandRange) { + if (objects) + genObjectList2(*objects, converter, operandRange); + + if (clauses && clauses->size() > 0) + TODO(converter.getCurrentLocation(), "Handle OmpMemoryOrderClause"); +} - // TODO Merge with the reduction CB. 
- genInfo.setGenRegionEntryCb(genRegionEntryCB).setDataSharingProcessor(&dsp); - return genOpWithBody(genInfo, clauseOps); +static void +genOrderedRegionClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::OrderedRegionClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processTODO(loc, llvm::omp::Directive::OMPD_ordered); +} + +static void genParallelClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + bool processReduction, mlir::omp::ParallelClauseOps &clauseOps, + llvm::SmallVectorImpl &reductionTypes, + llvm::SmallVectorImpl &reductionSyms) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processAllocate(clauseOps); + cp.processDefault(); + cp.processIf(llvm::omp::Directive::OMPD_parallel, clauseOps); + cp.processNumThreads(stmtCtx, clauseOps); + cp.processProcBind(clauseOps); + + if (processReduction) { + cp.processReduction(loc, clauseOps, &reductionTypes, &reductionSyms); + if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars)) + clauseOps.reductionByRefAttr = converter.getFirOpBuilder().getUnitAttr(); + } } -static mlir::omp::SectionOp -genSectionOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList §ionsClauseList) { - // Currently only private/firstprivate clause is handled, and - // all privatization is done within `omp.section` operations. - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(§ionsClauseList)); +static void genSectionsClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + bool clausesFromBeginSections, + mlir::omp::SectionsClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + if (clausesFromBeginSections) { + cp.processAllocate(clauseOps); + cp.processSectionsReduction(loc, clauseOps); + // TODO Support delayed privatization. 
+ } else { + cp.processNowait(clauseOps); + } } -static mlir::omp::SingleOp -genSingleOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &beginClauseList, - const Fortran::parser::OmpClauseList &endClauseList) { - mlir::omp::SingleClauseOps clauseOps; +static void genSimdLoopClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + mlir::omp::SimdLoopClauseOps &clauseOps, + llvm::SmallVectorImpl &iv) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processCollapse(loc, eval, clauseOps, iv); + cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps); + cp.processReduction(loc, clauseOps); + cp.processSafelen(clauseOps); + cp.processSimdlen(clauseOps); + clauseOps.loopInclusiveAttr = converter.getFirOpBuilder().getUnitAttr(); + // TODO Support delayed privatization. - ClauseProcessor cp(converter, semaCtx, beginClauseList); - cp.processAllocate(clauseOps); + cp.processTODO( + loc, llvm::omp::Directive::OMPD_simd); +} + +static void genSingleClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &beginClauses, + const Fortran::parser::OmpClauseList &endClauses, + mlir::Location loc, + mlir::omp::SingleClauseOps &clauseOps) { + ClauseProcessor bcp(converter, semaCtx, beginClauses); + bcp.processAllocate(clauseOps); // TODO Support delayed privatization. - ClauseProcessor ecp(converter, semaCtx, endClauseList); + ClauseProcessor ecp(converter, semaCtx, endClauses); + ecp.processCopyprivate(loc, clauseOps); ecp.processNowait(clauseOps); - ecp.processCopyprivate(currentLocation, clauseOps); +} - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(&beginClauseList), - clauseOps); +static void genTargetClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + bool processHostOnlyClauses, bool processReduction, + mlir::omp::TargetClauseOps &clauseOps, + llvm::SmallVectorImpl &mapSyms, + llvm::SmallVectorImpl &mapLocs, + llvm::SmallVectorImpl &mapTypes, + llvm::SmallVectorImpl &deviceAddrSyms, + llvm::SmallVectorImpl &deviceAddrLocs, + llvm::SmallVectorImpl &deviceAddrTypes, + llvm::SmallVectorImpl &devicePtrSyms, + llvm::SmallVectorImpl &devicePtrLocs, + llvm::SmallVectorImpl &devicePtrTypes) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processDepend(clauseOps); + cp.processDevice(stmtCtx, clauseOps); + cp.processHasDeviceAddr(clauseOps, deviceAddrTypes, deviceAddrLocs, + deviceAddrSyms); + cp.processIf(llvm::omp::Directive::OMPD_target, clauseOps); + cp.processIsDevicePtr(clauseOps, devicePtrTypes, devicePtrLocs, + devicePtrSyms); + cp.processMap(loc, stmtCtx, clauseOps, &mapSyms, &mapLocs, &mapTypes); + cp.processThreadLimit(stmtCtx, clauseOps); + // TODO Support delayed privatization. 
+
+  if (processHostOnlyClauses)
+    cp.processNowait(clauseOps);
+
+  cp.processTODO(loc,
+                 llvm::omp::Directive::OMPD_target);
+}
+
+static void genTargetDataClauses(
+    Fortran::lower::AbstractConverter &converter,
+    Fortran::semantics::SemanticsContext &semaCtx,
+    Fortran::lower::StatementContext &stmtCtx,
+    const Fortran::parser::OmpClauseList &clauses, mlir::Location loc,
+    mlir::omp::TargetDataClauseOps &clauseOps,
+    llvm::SmallVectorImpl &useDeviceTypes,
+    llvm::SmallVectorImpl &useDeviceLocs,
+    llvm::SmallVectorImpl &useDeviceSyms) {
+  ClauseProcessor cp(converter, semaCtx, clauses);
+  cp.processDevice(stmtCtx, clauseOps);
+  cp.processIf(llvm::omp::Directive::OMPD_target_data, clauseOps);
+  cp.processMap(loc, stmtCtx, clauseOps);
+  cp.processUseDeviceAddr(clauseOps, useDeviceTypes, useDeviceLocs,
+                          useDeviceSyms);
+  cp.processUseDevicePtr(clauseOps, useDeviceTypes, useDeviceLocs,
+                         useDeviceSyms);
+
+  // This function implements the deprecated functionality of use_device_ptr
+  // that allows users to provide non-CPTR arguments to it with the caveat
+  // that the compiler will treat them as use_device_addr. A lot of legacy
+  // code may still depend on this functionality, so we should support it
+  // in some manner. We do so currently by simply shifting non-cptr operands
+  // from the use_device_ptr list into the front of the use_device_addr list
+  // whilst maintaining the ordering of useDeviceLocs, useDeviceSyms and
+  // useDeviceTypes to use_device_ptr/use_device_addr input for BlockArg
+  // ordering.
+  // TODO: Perhaps create a user-providable compiler option that will
+  // re-introduce a hard-error rather than a warning in these cases.
+ promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr(clauseOps, useDeviceTypes, + useDeviceLocs, useDeviceSyms); +} + +static void genTargetEnterExitUpdateDataClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + llvm::omp::Directive directive, + mlir::omp::TargetEnterExitUpdateDataClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processDepend(clauseOps); + cp.processDevice(stmtCtx, clauseOps); + cp.processIf(directive, clauseOps); + cp.processNowait(clauseOps); + + if (directive == llvm::omp::Directive::OMPD_target_update) { + cp.processMotionClauses(stmtCtx, clauseOps); + cp.processMotionClauses(stmtCtx, clauseOps); + } else { + cp.processMap(loc, stmtCtx, clauseOps); + } +} + +static void genTaskClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TaskClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processDefault(); + cp.processDepend(clauseOps); cp.processFinal(stmtCtx, clauseOps); - cp.processUntied(clauseOps); + cp.processIf(llvm::omp::Directive::OMPD_task, clauseOps); cp.processMergeable(clauseOps); cp.processPriority(stmtCtx, clauseOps); - cp.processDepend(clauseOps); + cp.processUntied(clauseOps); // TODO Support delayed privatization. - cp.processTODO( - currentLocation, llvm::omp::Directive::OMPD_task); + cp.processTODO( + loc, llvm::omp::Directive::OMPD_task); +} - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(&clauseList), - clauseOps); +static void genTaskgroupClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TaskgroupClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processAllocate(clauseOps); + + cp.processTODO(loc, + llvm::omp::Directive::OMPD_taskgroup); } -static mlir::omp::TaskgroupOp -genTaskgroupOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - mlir::omp::TaskgroupClauseOps clauseOps; +static void genTaskwaitClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TaskwaitClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processTODO( + loc, llvm::omp::Directive::OMPD_taskwait); +} - ClauseProcessor cp(converter, semaCtx, clauseList); +static void genTeamsClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TeamsClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); - cp.processTODO(currentLocation, - llvm::omp::Directive::OMPD_taskgroup); + cp.processDefault(); + cp.processIf(llvm::omp::Directive::OMPD_teams, clauseOps); + 
cp.processNumTeams(stmtCtx, clauseOps); + cp.processThreadLimit(stmtCtx, clauseOps); + // TODO Support delayed privatization. - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(&clauseList), - clauseOps); + cp.processTODO(loc, llvm::omp::Directive::OMPD_teams); } -// This helper function implements the functionality of "promoting" -// non-CPTR arguments of use_device_ptr to use_device_addr -// arguments (automagic conversion of use_device_ptr -> -// use_device_addr in these cases). The way we do so currently is -// through the shuffling of operands from the devicePtrOperands to -// deviceAddrOperands where neccesary and re-organizing the types, -// locations and symbols to maintain the correct ordering of ptr/addr -// input -> BlockArg. -// -// This effectively implements some deprecated OpenMP functionality -// that some legacy applications unfortunately depend on -// (deprecated in specification version 5.2): -// -// "If a list item in a use_device_ptr clause is not of type C_PTR, -// the behavior is as if the list item appeared in a use_device_addr -// clause. Support for such list items in a use_device_ptr clause -// is deprecated." -static void promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr( - mlir::omp::UseDeviceClauseOps &clauseOps, - llvm::SmallVectorImpl &useDeviceTypes, - llvm::SmallVectorImpl &useDeviceLocs, - llvm::SmallVectorImpl - &useDeviceSymbols) { - auto moveElementToBack = [](size_t idx, auto &vector) { - auto *iter = std::next(vector.begin(), idx); - vector.push_back(*iter); - vector.erase(iter); - }; +static void genWsloopClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &beginClauses, + const Fortran::parser::OmpClauseList *endClauses, mlir::Location loc, + mlir::omp::WsloopClauseOps &clauseOps, + llvm::SmallVectorImpl &iv, + llvm::SmallVectorImpl &reductionTypes, + llvm::SmallVectorImpl &reductionSyms) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + ClauseProcessor bcp(converter, semaCtx, beginClauses); + bcp.processCollapse(loc, eval, clauseOps, iv); + bcp.processOrdered(clauseOps); + bcp.processReduction(loc, clauseOps, &reductionTypes, &reductionSyms); + bcp.processSchedule(stmtCtx, clauseOps); + clauseOps.loopInclusiveAttr = firOpBuilder.getUnitAttr(); + // TODO Support delayed privatization. - // Iterate over our use_device_ptr list and shift all non-cptr arguments into - // use_device_addr. - for (auto *it = clauseOps.useDevicePtrVars.begin(); - it != clauseOps.useDevicePtrVars.end();) { - if (!fir::isa_builtin_cptr_type(fir::unwrapRefType(it->getType()))) { - clauseOps.useDeviceAddrVars.push_back(*it); - // We have to shuffle the symbols around as well, to maintain - // the correct Input -> BlockArg for use_device_ptr/use_device_addr. - // NOTE: However, as map's do not seem to be included currently - // this isn't as pertinent, but we must try to maintain for - // future alterations. I believe the reason they are not currently - // is that the BlockArg assign/lowering needs to be extended - // to a greater set of types. 
- auto idx = std::distance(clauseOps.useDevicePtrVars.begin(), it); - moveElementToBack(idx, useDeviceTypes); - moveElementToBack(idx, useDeviceLocs); - moveElementToBack(idx, useDeviceSymbols); - it = clauseOps.useDevicePtrVars.erase(it); - continue; + if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars)) + clauseOps.reductionByRefAttr = firOpBuilder.getUnitAttr(); + + if (endClauses) { + ClauseProcessor ecp(converter, semaCtx, *endClauses); + ecp.processNowait(clauseOps); + } + + bcp.processTODO( + loc, llvm::omp::Directive::OMPD_do); +} + +//===----------------------------------------------------------------------===// +// Code generation functions for leaf constructs +//===----------------------------------------------------------------------===// + +static mlir::omp::BarrierOp +genBarrierOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc) { + return converter.getFirOpBuilder().create(loc); +} + +static mlir::omp::CriticalOp +genCriticalOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList, + const std::optional &name) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + mlir::FlatSymbolRefAttr nameAttr; + + if (name) { + std::string nameStr = name->ToString(); + mlir::ModuleOp mod = firOpBuilder.getModule(); + auto global = mod.lookupSymbol(nameStr); + if (!global) { + mlir::omp::CriticalClauseOps clauseOps; + genCriticalDeclareClauses(converter, semaCtx, clauseList, loc, clauseOps, + nameStr); + + mlir::OpBuilder modBuilder(mod.getBodyRegion()); + global = modBuilder.create(loc, clauseOps); } - ++it; + nameAttr = mlir::FlatSymbolRefAttr::get(firOpBuilder.getContext(), + global.getSymName()); } + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(genNested), + nameAttr); } -static mlir::omp::TargetDataOp -genTargetDataOp(Fortran::lower::AbstractConverter &converter, +static mlir::omp::DistributeOp +genDistributeOp(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, + mlir::Location loc, const Fortran::parser::OmpClauseList &clauseList) { - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TargetDataClauseOps clauseOps; - llvm::SmallVector useDeviceTypes; - llvm::SmallVector useDeviceLocs; - llvm::SmallVector useDeviceSyms; + TODO(loc, "Distribute construct"); + return nullptr; +} - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_target_data, clauseOps); - cp.processDevice(stmtCtx, clauseOps); - cp.processUseDevicePtr(clauseOps, useDeviceTypes, useDeviceLocs, - useDeviceSyms); - cp.processUseDeviceAddr(clauseOps, useDeviceTypes, useDeviceLocs, - useDeviceSyms); +static mlir::omp::FlushOp +genFlushOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const std::optional &objectList, + const std::optional> + &clauseList) { + llvm::SmallVector operandRange; + genFlushClauses(converter, semaCtx, objectList, clauseList, loc, + operandRange); + + return converter.getFirOpBuilder().create( + converter.getCurrentLocation(), operandRange); +} - // This function implements 
the deprecated functionality of use_device_ptr - // that allows users to provide non-CPTR arguments to it with the caveat - // that the compiler will treat them as use_device_addr. A lot of legacy - // code may still depend on this functionality, so we should support it - // in some manner. We do so currently by simply shifting non-cptr operands - // from the use_device_ptr list into the front of the use_device_addr list - // whilst maintaining the ordering of useDeviceLocs, useDeviceSymbols and - // useDeviceTypes to use_device_ptr/use_device_addr input for BlockArg - // ordering. - // TODO: Perhaps create a user provideable compiler option that will - // re-introduce a hard-error rather than a warning in these cases. - promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr(clauseOps, useDeviceTypes, - useDeviceLocs, useDeviceSyms); - cp.processMap(currentLocation, llvm::omp::Directive::OMPD_target_data, - stmtCtx, clauseOps); +static mlir::omp::MasterOp +genMasterOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc) { + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(genNested)); +} + +static mlir::omp::OrderedOp +genOrderedOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + TODO(loc, "OMPD_ordered"); + return nullptr; +} + +static mlir::omp::OrderedRegionOp +genOrderedRegionOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + mlir::omp::OrderedRegionClauseOps clauseOps; + genOrderedRegionClauses(converter, semaCtx, clauseList, loc, clauseOps); + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(genNested), + clauseOps); +} + +static mlir::omp::ParallelOp +genParallelOp(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList, + bool outerCombined = false) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + Fortran::lower::StatementContext stmtCtx; + mlir::omp::ParallelClauseOps clauseOps; + llvm::SmallVector privateSyms; + llvm::SmallVector reductionTypes; + llvm::SmallVector reductionSyms; + genParallelClauses(converter, semaCtx, stmtCtx, clauseList, loc, + /*processReduction=*/!outerCombined, clauseOps, + reductionTypes, reductionSyms); - auto dataOp = converter.getFirOpBuilder().create( - currentLocation, clauseOps); + auto reductionCallback = [&](mlir::Operation *op) { + genReductionVars(op, converter, loc, reductionSyms, reductionTypes); + return reductionSyms; + }; - genBodyOfTargetDataOp(converter, semaCtx, eval, genNested, dataOp, - useDeviceTypes, useDeviceLocs, useDeviceSyms, - currentLocation); - return dataOp; -} + OpWithBodyGenInfo genInfo = + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setOuterCombined(outerCombined) + .setClauses(&clauseList) + .setReductions(&reductionSyms, &reductionTypes) + .setGenRegionEntryCb(reductionCallback); -template -static OpTy genTargetEnterExitDataUpdateOp( - 
Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps; + if (!enableDelayedPrivatization) + return genOpWithBody(genInfo, clauseOps); - // GCC 9.3.0 emits a (probably) bogus warning about an unused variable. - [[maybe_unused]] llvm::omp::Directive directive; - if constexpr (std::is_same_v) { - directive = llvm::omp::Directive::OMPD_target_enter_data; - } else if constexpr (std::is_same_v) { - directive = llvm::omp::Directive::OMPD_target_exit_data; - } else if constexpr (std::is_same_v) { - directive = llvm::omp::Directive::OMPD_target_update; - } else { - return nullptr; - } + bool privatize = !outerCombined; + DataSharingProcessor dsp(converter, semaCtx, clauseList, eval, + /*useDelayedPrivatization=*/true, &symTable); - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(directive, clauseOps); - cp.processDevice(stmtCtx, clauseOps); - cp.processDepend(clauseOps); - cp.processNowait(clauseOps); + if (privatize) + dsp.processStep1(&clauseOps, &privateSyms); - if constexpr (std::is_same_v) { - cp.processMotionClauses(stmtCtx, clauseOps); - cp.processMotionClauses(stmtCtx, clauseOps); - } else { - cp.processMap(currentLocation, directive, stmtCtx, clauseOps); - } + auto genRegionEntryCB = [&](mlir::Operation *op) { + auto parallelOp = llvm::cast(op); - return firOpBuilder.create(currentLocation, clauseOps); -} + llvm::SmallVector reductionLocs( + clauseOps.reductionVars.size(), loc); -// This functions creates a block for the body of the targetOp's region. It adds -// all the symbols present in mapSymbols as block arguments to this block. -static void -genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::omp::TargetOp &targetOp, - llvm::ArrayRef mapSyms, - llvm::ArrayRef mapSymLocs, - llvm::ArrayRef mapSymTypes, - const mlir::Location ¤tLocation) { - assert(mapSymTypes.size() == mapSymLocs.size()); + mlir::OperandRange privateVars = parallelOp.getPrivateVars(); + mlir::Region ®ion = parallelOp.getRegion(); - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::Region ®ion = targetOp.getRegion(); + llvm::SmallVector privateVarTypes = reductionTypes; + privateVarTypes.reserve(privateVarTypes.size() + privateVars.size()); + llvm::transform(privateVars, std::back_inserter(privateVarTypes), + [](mlir::Value v) { return v.getType(); }); - auto *regionBlock = - firOpBuilder.createBlock(®ion, {}, mapSymTypes, mapSymLocs); + llvm::SmallVector privateVarLocs = reductionLocs; + privateVarLocs.reserve(privateVarLocs.size() + privateVars.size()); + llvm::transform(privateVars, std::back_inserter(privateVarLocs), + [](mlir::Value v) { return v.getLoc(); }); - // Clones the `bounds` placing them inside the target region and returns them. 
- auto cloneBound = [&](mlir::Value bound) { - if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { - mlir::Operation *clonedOp = bound.getDefiningOp()->clone(); - regionBlock->push_back(clonedOp); - return clonedOp->getResult(0); + firOpBuilder.createBlock(®ion, /*insertPt=*/{}, privateVarTypes, + privateVarLocs); + + llvm::SmallVector allSymbols = + reductionSyms; + allSymbols.append(privateSyms); + for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) { + converter.bindSymbol(*arg, prv); } - TODO(converter.getCurrentLocation(), - "target map clause operand unsupported bound type"); - }; - auto cloneBounds = [cloneBound](llvm::ArrayRef bounds) { - llvm::SmallVector clonedBounds; - for (mlir::Value bound : bounds) - clonedBounds.emplace_back(cloneBound(bound)); - return clonedBounds; + return allSymbols; }; - // Bind the symbols to their corresponding block arguments. - for (auto [argIndex, argSymbol] : llvm::enumerate(mapSyms)) { - const mlir::BlockArgument &arg = region.getArgument(argIndex); - // Avoid capture of a reference to a structured binding. - const Fortran::semantics::Symbol *sym = argSymbol; - // Structure component symbols don't have bindings. - if (sym->owner().IsDerivedType()) - continue; - fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); - extVal.match( - [&](const fir::BoxValue &v) { - converter.bindSymbol(*sym, - fir::BoxValue(arg, cloneBounds(v.getLBounds()), - v.getExplicitParameters(), - v.getExplicitExtents())); - }, - [&](const fir::MutableBoxValue &v) { - converter.bindSymbol( - *sym, fir::MutableBoxValue(arg, cloneBounds(v.getLBounds()), - v.getMutableProperties())); - }, - [&](const fir::ArrayBoxValue &v) { - converter.bindSymbol( - *sym, fir::ArrayBoxValue(arg, cloneBounds(v.getExtents()), - cloneBounds(v.getLBounds()), - v.getSourceBox())); - }, - [&](const fir::CharArrayBoxValue &v) { - converter.bindSymbol( - *sym, fir::CharArrayBoxValue(arg, cloneBound(v.getLen()), - cloneBounds(v.getExtents()), - cloneBounds(v.getLBounds()))); - }, - [&](const fir::CharBoxValue &v) { - converter.bindSymbol(*sym, - fir::CharBoxValue(arg, cloneBound(v.getLen()))); - }, - [&](const fir::UnboxedValue &v) { converter.bindSymbol(*sym, arg); }, - [&](const auto &) { - TODO(converter.getCurrentLocation(), - "target map clause operand unsupported type"); - }); - } + // TODO Merge with the reduction CB. + genInfo.setGenRegionEntryCb(genRegionEntryCB).setDataSharingProcessor(&dsp); + return genOpWithBody(genInfo, clauseOps); +} - // Check if cloning the bounds introduced any dependency on the outer region. - // If so, then either clone them as well if they are MemoryEffectFree, or else - // copy them to a new temporary and add them to the map and block_argument - // lists and replace their uses with the new temporary. 
- llvm::SetVector valuesDefinedAbove; - mlir::getUsedValuesDefinedAbove(region, valuesDefinedAbove); - while (!valuesDefinedAbove.empty()) { - for (mlir::Value val : valuesDefinedAbove) { - mlir::Operation *valOp = val.getDefiningOp(); - if (mlir::isMemoryEffectFree(valOp)) { - mlir::Operation *clonedOp = valOp->clone(); - regionBlock->push_front(clonedOp); - val.replaceUsesWithIf( - clonedOp->getResult(0), [regionBlock](mlir::OpOperand &use) { - return use.getOwner()->getBlock() == regionBlock; - }); - } else { - auto savedIP = firOpBuilder.getInsertionPoint(); - firOpBuilder.setInsertionPointAfter(valOp); - auto copyVal = - firOpBuilder.createTemporary(val.getLoc(), val.getType()); - firOpBuilder.createStoreWithConvert(copyVal.getLoc(), val, copyVal); +static mlir::omp::SectionOp +genSectionOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + // Currently only private/firstprivate clause is handled, and + // all privatization is done within `omp.section` operations. + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setClauses(&clauseList)); +} - llvm::SmallVector bounds; - std::stringstream name; - firOpBuilder.setInsertionPoint(targetOp); - mlir::Value mapOp = createMapInfoOp( - firOpBuilder, copyVal.getLoc(), copyVal, mlir::Value{}, name.str(), - bounds, llvm::SmallVector{}, - static_cast< - std::underlying_type_t>( - llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT), - mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType()); - targetOp.getMapOperandsMutable().append(mapOp); - mlir::Value clonedValArg = - region.addArgument(copyVal.getType(), copyVal.getLoc()); - firOpBuilder.setInsertionPointToStart(regionBlock); - auto loadOp = firOpBuilder.create(clonedValArg.getLoc(), - clonedValArg); - val.replaceUsesWithIf( - loadOp->getResult(0), [regionBlock](mlir::OpOperand &use) { - return use.getOwner()->getBlock() == regionBlock; - }); - firOpBuilder.setInsertionPoint(regionBlock, savedIP); - } - } - valuesDefinedAbove.clear(); - mlir::getUsedValuesDefinedAbove(region, valuesDefinedAbove); - } +static mlir::omp::SectionsOp +genSectionsOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const mlir::omp::SectionsClauseOps &clauseOps) { + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(false), + clauseOps); +} - // Insert dummy instruction to remember the insertion position. The - // marker will be deleted since there are not uses. - // In the HLFIR flow there are hlfir.declares inserted above while - // setting block arguments. - mlir::Value undefMarker = firOpBuilder.create( - targetOp.getOperation()->getLoc(), firOpBuilder.getIndexType()); +static mlir::omp::SimdLoopOp +genSimdLoopOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + DataSharingProcessor dsp(converter, semaCtx, clauseList, eval); + dsp.processStep1(); - // Create blocks for unstructured regions. This has to be done since - // blocks are initially allocated with the function as the parent region. 
- if (eval.lowerAsUnstructured()) { - Fortran::lower::createEmptyRegionBlocks( - firOpBuilder, eval.getNestedEvaluations()); - } + Fortran::lower::StatementContext stmtCtx; + mlir::omp::SimdLoopClauseOps clauseOps; + llvm::SmallVector iv; + genSimdLoopClauses(converter, semaCtx, stmtCtx, eval, clauseList, loc, + clauseOps, iv); - firOpBuilder.create(currentLocation); + auto *nestedEval = + getCollapsedLoopEval(eval, Fortran::lower::getCollapseValue(clauseList)); - // Create the insertion point after the marker. - firOpBuilder.setInsertionPointAfter(undefMarker.getDefiningOp()); - if (genNested) - genNestedEvaluations(converter, eval); + auto ivCallback = [&](mlir::Operation *op) { + return genLoopVars(op, converter, loc, iv); + }; + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval) + .setClauses(&clauseList) + .setDataSharingProcessor(&dsp) + .setGenRegionEntryCb(ivCallback), + clauseOps); +} + +static mlir::omp::SingleOp +genSingleOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList &endClauseList) { + mlir::omp::SingleClauseOps clauseOps; + genSingleClauses(converter, semaCtx, beginClauseList, endClauseList, loc, + clauseOps); + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setClauses(&beginClauseList), + clauseOps); } static mlir::omp::TargetOp genTargetOp(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, + mlir::Location loc, const Fortran::parser::OmpClauseList &clauseList, - llvm::omp::Directive directive, bool outerCombined = false) { + bool outerCombined = false) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); Fortran::lower::StatementContext stmtCtx; + + bool processHostOnlyClauses = + !llvm::cast(*converter.getModuleOp()) + .getIsTargetDevice(); + mlir::omp::TargetClauseOps clauseOps; - llvm::SmallVector mapTypes, devicePtrTypes, deviceAddrTypes; - llvm::SmallVector mapLocs, devicePtrLocs, deviceAddrLocs; llvm::SmallVector mapSyms, devicePtrSyms, deviceAddrSyms; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_target, clauseOps); - cp.processDevice(stmtCtx, clauseOps); - cp.processThreadLimit(stmtCtx, clauseOps); - cp.processDepend(clauseOps); - cp.processNowait(clauseOps); - cp.processMap(currentLocation, directive, stmtCtx, clauseOps, &mapSyms, - &mapLocs, &mapTypes); - cp.processIsDevicePtr(clauseOps, devicePtrTypes, devicePtrLocs, - devicePtrSyms); - cp.processHasDeviceAddr(clauseOps, deviceAddrTypes, deviceAddrLocs, - deviceAddrSyms); - // TODO Support delayed privatization. 
- - cp.processTODO(currentLocation, - llvm::omp::Directive::OMPD_target); + llvm::SmallVector mapLocs, devicePtrLocs, deviceAddrLocs; + llvm::SmallVector mapTypes, devicePtrTypes, deviceAddrTypes; + genTargetClauses(converter, semaCtx, stmtCtx, clauseList, loc, + processHostOnlyClauses, /*processReduction=*/outerCombined, + clauseOps, mapSyms, mapLocs, mapTypes, deviceAddrSyms, + deviceAddrLocs, deviceAddrTypes, devicePtrSyms, + devicePtrLocs, devicePtrTypes); // 5.8.1 Implicit Data-Mapping Attribute Rules // The following code follows the implicit data-mapping rules to map all the @@ -1056,22 +1515,21 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(sym); name << sym.name().ToString(); - Fortran::lower::AddrAndBoundsInfo info = - getDataOperandBaseAddr(converter, converter.getFirOpBuilder(), sym, - converter.getCurrentLocation()); + Fortran::lower::AddrAndBoundsInfo info = getDataOperandBaseAddr( + converter, firOpBuilder, sym, converter.getCurrentLocation()); if (fir::unwrapRefType(info.addr.getType()).isa()) bounds = Fortran::lower::genBoundsOpsFromBox( - converter.getFirOpBuilder(), converter.getCurrentLocation(), - converter, dataExv, info); + firOpBuilder, converter.getCurrentLocation(), converter, + dataExv, info); if (fir::unwrapRefType(info.addr.getType()).isa()) { bool dataExvIsAssumedSize = Fortran::semantics::IsAssumedSizeArray(sym.GetUltimate()); bounds = Fortran::lower::genBaseBoundsOps( - converter.getFirOpBuilder(), converter.getCurrentLocation(), - converter, dataExv, dataExvIsAssumedSize); + firOpBuilder, converter.getCurrentLocation(), converter, dataExv, + dataExvIsAssumedSize); } llvm::omp::OpenMPOffloadMappingFlags mapFlag = @@ -1085,7 +1543,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, // If a variable is specified in declare target link and if device // type is not specified as `nohost`, it needs to be mapped tofrom - mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); + mlir::ModuleOp mod = firOpBuilder.getModule(); mlir::Operation *op = mod.lookupSymbol(converter.mangleName(sym)); auto declareTargetOp = llvm::dyn_cast_if_present(op); @@ -1105,8 +1563,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, } mlir::Value mapOp = createMapInfoOp( - converter.getFirOpBuilder(), baseOp.getLoc(), baseOp, mlir::Value{}, - name.str(), bounds, {}, + firOpBuilder, baseOp.getLoc(), baseOp, mlir::Value{}, name.str(), + bounds, {}, static_cast< std::underlying_type_t>( mapFlag), @@ -1118,341 +1576,147 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, mapTypes.push_back(baseOp.getType()); } } - }; - Fortran::lower::pft::visitAllSymbols(eval, captureImplicitMap); - - auto targetOp = converter.getFirOpBuilder().create( - currentLocation, clauseOps); - - genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSyms, - mapLocs, mapTypes, currentLocation); - - return targetOp; -} - -static mlir::omp::TeamsOp -genTeamsOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList, - bool outerCombined = false) { - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TeamsClauseOps clauseOps; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_teams, clauseOps); - cp.processAllocate(clauseOps); - cp.processDefault(); - 
cp.processNumTeams(stmtCtx, clauseOps); - cp.processThreadLimit(stmtCtx, clauseOps); - // TODO Support delayed privatization. - - cp.processTODO(currentLocation, - llvm::omp::Directive::OMPD_teams); - - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setOuterCombined(outerCombined) - .setClauses(&clauseList), - clauseOps); -} - -/// Extract the list of function and variable symbols affected by the given -/// 'declare target' directive and return the intended device type for them. -static void getDeclareTargetInfo( - Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, - mlir::omp::DeclareTargetClauseOps &clauseOps, - llvm::SmallVectorImpl &symbolAndClause) { - const auto &spec = std::get( - declareTargetConstruct.t); - if (const auto *objectList{ - Fortran::parser::Unwrap(spec.u)}) { - ObjectList objects{makeObjects(*objectList, semaCtx)}; - // Case: declare target(func, var1, var2) - gatherFuncAndVarSyms(objects, mlir::omp::DeclareTargetCaptureClause::to, - symbolAndClause); - } else if (const auto *clauseList{ - Fortran::parser::Unwrap( - spec.u)}) { - if (clauseList->v.empty()) { - // Case: declare target, implicit capture of function - symbolAndClause.emplace_back( - mlir::omp::DeclareTargetCaptureClause::to, - eval.getOwningProcedure()->getSubprogramSymbol()); - } - - ClauseProcessor cp(converter, semaCtx, *clauseList); - cp.processTo(symbolAndClause); - cp.processEnter(symbolAndClause); - cp.processLink(symbolAndClause); - cp.processDeviceType(clauseOps); - cp.processTODO(converter.getCurrentLocation(), - llvm::omp::Directive::OMPD_declare_target); - } -} - -static void collectDeferredDeclareTargets( - Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, - llvm::SmallVectorImpl - &deferredDeclareTarget) { - mlir::omp::DeclareTargetClauseOps clauseOps; - llvm::SmallVector symbolAndClause; - getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, - clauseOps, symbolAndClause); - // Return the device type only if at least one of the targets for the - // directive is a function or subroutine - mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); - - for (const DeclareTargetCapturePair &symClause : symbolAndClause) { - mlir::Operation *op = mod.lookupSymbol(converter.mangleName( - std::get(symClause))); - - if (!op) { - deferredDeclareTarget.push_back({std::get<0>(symClause), - clauseOps.deviceType, - std::get<1>(symClause)}); - } - } -} - -static std::optional -getDeclareTargetFunctionDevice( - Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPDeclareTargetConstruct - &declareTargetConstruct) { - mlir::omp::DeclareTargetClauseOps clauseOps; - llvm::SmallVector symbolAndClause; - getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, - clauseOps, symbolAndClause); - - // Return the device type only if at least one of the targets for the - // directive is a function or subroutine - mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); - for (const DeclareTargetCapturePair &symClause : symbolAndClause) { - mlir::Operation *op = 
mod.lookupSymbol(converter.mangleName( - std::get(symClause))); - - if (mlir::isa_and_nonnull(op)) - return clauseOps.deviceType; - } - - return std::nullopt; -} - -//===----------------------------------------------------------------------===// -// genOMP() Code generation helper functions -//===----------------------------------------------------------------------===// - -static void -genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - const Fortran::parser::OpenMPSimpleStandaloneConstruct - &simpleStandaloneConstruct) { - const auto &directive = - std::get( - simpleStandaloneConstruct.t); - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - const auto &opClauseList = - std::get(simpleStandaloneConstruct.t); - mlir::Location currentLocation = converter.genLocation(directive.source); - - switch (directive.v) { - default: - break; - case llvm::omp::Directive::OMPD_barrier: - firOpBuilder.create(currentLocation); - break; - case llvm::omp::Directive::OMPD_taskwait: { - mlir::omp::TaskwaitClauseOps clauseOps; - ClauseProcessor cp(converter, semaCtx, opClauseList); - cp.processTODO( - currentLocation, llvm::omp::Directive::OMPD_taskwait); - firOpBuilder.create(currentLocation, clauseOps); - break; - } - case llvm::omp::Directive::OMPD_taskyield: - firOpBuilder.create(currentLocation); - break; - case llvm::omp::Directive::OMPD_target_data: - genTargetDataOp(converter, semaCtx, eval, genNested, currentLocation, - opClauseList); - break; - case llvm::omp::Directive::OMPD_target_enter_data: - genTargetEnterExitDataUpdateOp( - converter, semaCtx, currentLocation, opClauseList); - break; - case llvm::omp::Directive::OMPD_target_exit_data: - genTargetEnterExitDataUpdateOp( - converter, semaCtx, currentLocation, opClauseList); - break; - case llvm::omp::Directive::OMPD_target_update: - genTargetEnterExitDataUpdateOp( - converter, semaCtx, currentLocation, opClauseList); - break; - case llvm::omp::Directive::OMPD_ordered: - TODO(currentLocation, "OMPD_ordered"); - } -} - -static void -genOmpFlush(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPFlushConstruct &flushConstruct) { - llvm::SmallVector operandRange; - if (const auto &ompObjectList = - std::get>( - flushConstruct.t)) - genObjectList2(*ompObjectList, converter, operandRange); - const auto &memOrderClause = - std::get>>( - flushConstruct.t); - if (memOrderClause && memOrderClause->size() > 0) - TODO(converter.getCurrentLocation(), "Handle OmpMemoryOrderClause"); - converter.getFirOpBuilder().create( - converter.getCurrentLocation(), operandRange); -} - -static llvm::SmallVector -genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter, - mlir::Location &loc, - llvm::ArrayRef args) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - auto ®ion = op->getRegion(0); - - std::size_t loopVarTypeSize = 0; - for (const Fortran::semantics::Symbol *arg : args) - loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size()); - mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); - llvm::SmallVector tiv(args.size(), loopVarType); - llvm::SmallVector locs(args.size(), loc); - firOpBuilder.createBlock(®ion, {}, tiv, locs); - // The argument is not currently in memory, so make a temporary for the - // argument, and store it there, then bind 
that location to the argument. - mlir::Operation *storeOp = nullptr; - for (auto [argIndex, argSymbol] : llvm::enumerate(args)) { - mlir::Value indexVal = fir::getBase(region.front().getArgument(argIndex)); - storeOp = - createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); - } - firOpBuilder.setInsertionPointAfter(storeOp); - - return llvm::SmallVector(args); -} - -static llvm::SmallVector -genLoopAndReductionVars( - mlir::Operation *op, Fortran::lower::AbstractConverter &converter, - mlir::Location &loc, - llvm::ArrayRef loopArgs, - llvm::ArrayRef reductionArgs, - llvm::ArrayRef reductionTypes) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - - llvm::SmallVector blockArgTypes; - llvm::SmallVector blockArgLocs; - blockArgTypes.reserve(loopArgs.size() + reductionArgs.size()); - blockArgLocs.reserve(blockArgTypes.size()); - mlir::Block *entryBlock; - - if (loopArgs.size()) { - std::size_t loopVarTypeSize = 0; - for (const Fortran::semantics::Symbol *arg : loopArgs) - loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size()); - mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); - std::fill_n(std::back_inserter(blockArgTypes), loopArgs.size(), - loopVarType); - std::fill_n(std::back_inserter(blockArgLocs), loopArgs.size(), loc); - } - if (reductionArgs.size()) { - llvm::copy(reductionTypes, std::back_inserter(blockArgTypes)); - std::fill_n(std::back_inserter(blockArgLocs), reductionArgs.size(), loc); - } - entryBlock = firOpBuilder.createBlock(&op->getRegion(0), {}, blockArgTypes, - blockArgLocs); - // The argument is not currently in memory, so make a temporary for the - // argument, and store it there, then bind that location to the argument. - if (loopArgs.size()) { - mlir::Operation *storeOp = nullptr; - for (auto [argIndex, argSymbol] : llvm::enumerate(loopArgs)) { - mlir::Value indexVal = - fir::getBase(op->getRegion(0).front().getArgument(argIndex)); - storeOp = - createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); - } - firOpBuilder.setInsertionPointAfter(storeOp); - } - // Bind the reduction arguments to their block arguments - for (auto [arg, prv] : llvm::zip_equal( - reductionArgs, - llvm::drop_begin(entryBlock->getArguments(), loopArgs.size()))) { - converter.bindSymbol(*arg, prv); - } + }; + Fortran::lower::pft::visitAllSymbols(eval, captureImplicitMap); - return llvm::SmallVector(loopArgs); + auto targetOp = firOpBuilder.create(loc, clauseOps); + genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSyms, + mapLocs, mapTypes, loc); + return targetOp; } -static void -createSimdLoop(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - llvm::omp::Directive ompDirective, - const Fortran::parser::OmpClauseList &loopOpClauseList, - mlir::Location loc) { +static mlir::omp::TargetDataOp +genTargetDataOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + Fortran::lower::StatementContext stmtCtx; + mlir::omp::TargetDataClauseOps clauseOps; + llvm::SmallVector useDeviceTypes; + llvm::SmallVector useDeviceLocs; + llvm::SmallVector useDeviceSyms; + genTargetDataClauses(converter, semaCtx, stmtCtx, clauseList, loc, clauseOps, + useDeviceTypes, useDeviceLocs, useDeviceSyms); + + auto targetDataOp = + converter.getFirOpBuilder().create(loc, + 
clauseOps); + genBodyOfTargetDataOp(converter, semaCtx, eval, genNested, targetDataOp, + useDeviceTypes, useDeviceLocs, useDeviceSyms, loc); + return targetDataOp; +} + +template +static OpTy genTargetEnterExitUpdateDataOp( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - DataSharingProcessor dsp(converter, semaCtx, loopOpClauseList, eval); - dsp.processStep1(); + Fortran::lower::StatementContext stmtCtx; + + // GCC 9.3.0 emits a (probably) bogus warning about an unused variable. + [[maybe_unused]] llvm::omp::Directive directive; + if constexpr (std::is_same_v) { + directive = llvm::omp::Directive::OMPD_target_enter_data; + } else if constexpr (std::is_same_v) { + directive = llvm::omp::Directive::OMPD_target_exit_data; + } else if constexpr (std::is_same_v) { + directive = llvm::omp::Directive::OMPD_target_update; + } else { + llvm_unreachable("Unexpected TARGET DATA construct"); + } + + mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps; + genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx, clauseList, + loc, directive, clauseOps); + + return firOpBuilder.create(loc, clauseOps); +} +static mlir::omp::TaskOp +genTaskOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { Fortran::lower::StatementContext stmtCtx; - mlir::omp::SimdLoopClauseOps clauseOps; - llvm::SmallVector iv; + mlir::omp::TaskClauseOps clauseOps; + genTaskClauses(converter, semaCtx, stmtCtx, clauseList, loc, clauseOps); - ClauseProcessor cp(converter, semaCtx, loopOpClauseList); - cp.processCollapse(loc, eval, clauseOps, iv); - cp.processReduction(loc, clauseOps); - cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps); - cp.processSimdlen(clauseOps); - cp.processSafelen(clauseOps); - clauseOps.loopInclusiveAttr = firOpBuilder.getUnitAttr(); - // TODO Support delayed privatization. 
+ return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setClauses(&clauseList), + clauseOps); +} - cp.processTODO(loc, ompDirective); +static mlir::omp::TaskgroupOp +genTaskgroupOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + mlir::omp::TaskgroupClauseOps clauseOps; + genTaskgroupClauses(converter, semaCtx, clauseList, loc, clauseOps); - auto *nestedEval = getCollapsedLoopEval( - eval, Fortran::lower::getCollapseValue(loopOpClauseList)); + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setClauses(&clauseList), + clauseOps); +} - auto ivCallback = [&](mlir::Operation *op) { - return genLoopVars(op, converter, loc, iv); - }; +static mlir::omp::TaskloopOp +genTaskloopOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + TODO(loc, "Taskloop construct"); +} - genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval) - .setClauses(&loopOpClauseList) - .setDataSharingProcessor(&dsp) - .setGenRegionEntryCb(ivCallback), +static mlir::omp::TaskwaitOp +genTaskwaitOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + mlir::omp::TaskwaitClauseOps clauseOps; + genTaskwaitClauses(converter, semaCtx, clauseList, loc, clauseOps); + return converter.getFirOpBuilder().create(loc, + clauseOps); +} + +static mlir::omp::TaskyieldOp +genTaskyieldOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc) { + return converter.getFirOpBuilder().create(loc); +} + +static mlir::omp::TeamsOp +genTeamsOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, const Fortran::parser::OmpClauseList &clauseList, + bool outerCombined = false) { + Fortran::lower::StatementContext stmtCtx; + mlir::omp::TeamsClauseOps clauseOps; + genTeamsClauses(converter, semaCtx, stmtCtx, clauseList, loc, clauseOps); + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval) + .setGenNested(genNested) + .setOuterCombined(outerCombined) + .setClauses(&clauseList), clauseOps); } -static void createWsloop(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - llvm::omp::Directive ompDirective, - const Fortran::parser::OmpClauseList &beginClauseList, - const Fortran::parser::OmpClauseList *endClauseList, - mlir::Location loc) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); +static mlir::omp::WsloopOp +genWsloopOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList *endClauseList) { DataSharingProcessor dsp(converter, semaCtx, beginClauseList, eval); dsp.processStep1(); @@ -1461,30 +1725,9 @@ static 
void createWsloop(Fortran::lower::AbstractConverter &converter, llvm::SmallVector iv; llvm::SmallVector reductionTypes; llvm::SmallVector reductionSyms; - - ClauseProcessor cp(converter, semaCtx, beginClauseList); - cp.processCollapse(loc, eval, clauseOps, iv); - cp.processSchedule(stmtCtx, clauseOps); - cp.processReduction(loc, clauseOps, &reductionTypes, &reductionSyms); - cp.processOrdered(clauseOps); - clauseOps.loopInclusiveAttr = firOpBuilder.getUnitAttr(); - // TODO Support delayed privatization. - - if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars)) - clauseOps.reductionByRefAttr = firOpBuilder.getUnitAttr(); - - cp.processTODO(loc, - ompDirective); - - // In FORTRAN `nowait` clause occur at the end of `omp do` directive. - // i.e - // !$omp do - // <...> - // !$omp end do nowait - if (endClauseList) { - ClauseProcessor ecp(converter, semaCtx, *endClauseList); - ecp.processNowait(clauseOps); - } + genWsloopClauses(converter, semaCtx, stmtCtx, eval, beginClauseList, + endClauseList, loc, clauseOps, iv, reductionTypes, + reductionSyms); auto *nestedEval = getCollapsedLoopEval( eval, Fortran::lower::getCollapseValue(beginClauseList)); @@ -1494,7 +1737,7 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter, reductionTypes); }; - genOpWithBody( + return genOpWithBody( OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval) .setClauses(&beginClauseList) .setDataSharingProcessor(&dsp) @@ -1503,7 +1746,11 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter, clauseOps); } -static void createSimdWsloop( +//===----------------------------------------------------------------------===// +// Code generation functions for composite constructs +//===----------------------------------------------------------------------===// + +static void genCompositeDoSimd( Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, llvm::omp::Directive ompDirective, @@ -1511,7 +1758,7 @@ static void createSimdWsloop( const Fortran::parser::OmpClauseList *endClauseList, mlir::Location loc) { ClauseProcessor cp(converter, semaCtx, beginClauseList); cp.processTODO(loc, + clause::Order, clause::Safelen, clause::Simdlen>(loc, ompDirective); // TODO: Add support for vectorization - add vectorization hints inside loop // body. @@ -1521,34 +1768,7 @@ static void createSimdWsloop( // When support for vectorization is enabled, then we need to add handling of // if clause. Currently if clause can be skipped because we always assume // SIMD length = 1. - createWsloop(converter, semaCtx, eval, ompDirective, beginClauseList, - endClauseList, loc); -} - -static void -markDeclareTarget(mlir::Operation *op, - Fortran::lower::AbstractConverter &converter, - mlir::omp::DeclareTargetCaptureClause captureClause, - mlir::omp::DeclareTargetDeviceType deviceType) { - // TODO: Add support for program local variables with declare target applied - auto declareTargetOp = llvm::dyn_cast(op); - if (!declareTargetOp) - fir::emitFatalError( - converter.getCurrentLocation(), - "Attempt to apply declare target on unsupported operation"); - - // The function or global already has a declare target applied to it, very - // likely through implicit capture (usage in another declare target - // function/subroutine). 
It should be marked as any if it has been assigned - // both host and nohost, else we skip, as there is no change - if (declareTargetOp.isDeclareTarget()) { - if (declareTargetOp.getDeclareTargetDeviceType() != deviceType) - declareTargetOp.setDeclareTarget(mlir::omp::DeclareTargetDeviceType::any, - captureClause); - return; - } - - declareTargetOp.setDeclareTarget(deviceType, captureClause); + genWsloopOp(converter, semaCtx, eval, loc, beginClauseList, endClauseList); } //===----------------------------------------------------------------------===// @@ -1643,6 +1863,102 @@ genOMP(Fortran::lower::AbstractConverter &converter, ompDeclConstruct.u); } +//===----------------------------------------------------------------------===// +// OpenMPStandaloneConstruct visitors +//===----------------------------------------------------------------------===// + +static void genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPSimpleStandaloneConstruct + &simpleStandaloneConstruct) { + const auto &directive = + std::get( + simpleStandaloneConstruct.t); + const auto &clauseList = + std::get(simpleStandaloneConstruct.t); + mlir::Location currentLocation = converter.genLocation(directive.source); + + switch (directive.v) { + default: + break; + case llvm::omp::Directive::OMPD_barrier: + genBarrierOp(converter, semaCtx, eval, currentLocation); + break; + case llvm::omp::Directive::OMPD_taskwait: + genTaskwaitOp(converter, semaCtx, eval, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_taskyield: + genTaskyieldOp(converter, semaCtx, eval, currentLocation); + break; + case llvm::omp::Directive::OMPD_target_data: + genTargetDataOp(converter, semaCtx, eval, /*genNested=*/true, + currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_target_enter_data: + genTargetEnterExitUpdateDataOp( + converter, semaCtx, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_target_exit_data: + genTargetEnterExitUpdateDataOp( + converter, semaCtx, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_target_update: + genTargetEnterExitUpdateDataOp( + converter, semaCtx, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_ordered: + genOrderedOp(converter, semaCtx, eval, currentLocation, clauseList); + break; + } +} + +static void +genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPFlushConstruct &flushConstruct) { + const auto &verbatim = std::get(flushConstruct.t); + const auto &objectList = + std::get>(flushConstruct.t); + const auto &clauseList = + std::get>>( + flushConstruct.t); + mlir::Location currentLocation = converter.genLocation(verbatim.source); + genFlushOp(converter, semaCtx, eval, currentLocation, objectList, clauseList); +} + +static void +genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPCancelConstruct &cancelConstruct) { + TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct"); +} + +static void genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + 
Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPCancellationPointConstruct + &cancellationPointConstruct) { + TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct"); +} + +static void +genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPStandaloneConstruct &standaloneConstruct) { + std::visit( + [&](auto &&s) { return genOMP(converter, symTable, semaCtx, eval, s); }, + standaloneConstruct.u); +} + //===----------------------------------------------------------------------===// // OpenMPConstruct visitors //===----------------------------------------------------------------------===// @@ -1774,7 +2090,7 @@ genOMP(Fortran::lower::AbstractConverter &converter, break; case llvm::omp::Directive::OMPD_target: genTargetOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, - beginClauseList, directive.v); + beginClauseList); break; case llvm::omp::Directive::OMPD_target_data: genTargetDataOp(converter, semaCtx, eval, /*genNested=*/true, @@ -1790,8 +2106,7 @@ genOMP(Fortran::lower::AbstractConverter &converter, break; case llvm::omp::Directive::OMPD_teams: genTeamsOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, - beginClauseList, - /*outerCombined=*/false); + beginClauseList); break; case llvm::omp::Directive::OMPD_workshare: // FIXME: Workshare is not a commonly used OpenMP construct, an @@ -1813,8 +2128,7 @@ genOMP(Fortran::lower::AbstractConverter &converter, if ((llvm::omp::allTargetSet & llvm::omp::blockConstructSet) .test(directive.v)) { genTargetOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, - beginClauseList, directive.v, - /*outerCombined=*/true); + beginClauseList, /*outerCombined=*/true); combinedDirective = true; } if ((llvm::omp::allTeamsSet & llvm::omp::blockConstructSet) @@ -1851,44 +2165,13 @@ genOMP(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, const Fortran::parser::OpenMPCriticalConstruct &criticalConstruct) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::Location currentLocation = converter.getCurrentLocation(); - std::string name; - const Fortran::parser::OmpCriticalDirective &cd = + const auto &cd = std::get(criticalConstruct.t); - if (std::get>(cd.t).has_value()) { - name = - std::get>(cd.t).value().ToString(); - } - - mlir::omp::CriticalOp criticalOp = [&]() { - if (name.empty()) { - return firOpBuilder.create( - currentLocation, mlir::FlatSymbolRefAttr()); - } - - mlir::ModuleOp module = firOpBuilder.getModule(); - mlir::OpBuilder modBuilder(module.getBodyRegion()); - auto global = module.lookupSymbol(name); - if (!global) { - mlir::omp::CriticalClauseOps clauseOps; - const auto &clauseList = std::get(cd.t); - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processHint(clauseOps); - clauseOps.nameAttr = - mlir::StringAttr::get(firOpBuilder.getContext(), name); - - global = modBuilder.create(currentLocation, - clauseOps); - } - - return firOpBuilder.create( - currentLocation, mlir::FlatSymbolRefAttr::get(firOpBuilder.getContext(), - global.getSymName())); - }(); - auto genInfo = OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval); - createBodyOfOp(criticalOp, genInfo); + const auto &clauseList = std::get(cd.t); + const auto &name = std::get>(cd.t); + mlir::Location currentLocation = 
converter.getCurrentLocation(); + genCriticalOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, + clauseList, name); } static void @@ -1907,7 +2190,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, const Fortran::parser::OpenMPLoopConstruct &loopConstruct) { const auto &beginLoopDirective = std::get(loopConstruct.t); - const auto &loopOpClauseList = + const auto &beginClauseList = std::get(beginLoopDirective.t); mlir::Location currentLocation = converter.genLocation(beginLoopDirective.source); @@ -1928,33 +2211,31 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, bool validDirective = false; if (llvm::omp::topTaskloopSet.test(ompDirective)) { validDirective = true; - TODO(currentLocation, "Taskloop construct"); + genTaskloopOp(converter, semaCtx, eval, currentLocation, beginClauseList); } else { // Create omp.{target, teams, distribute, parallel} nested operations if ((llvm::omp::allTargetSet & llvm::omp::loopConstructSet) .test(ompDirective)) { validDirective = true; genTargetOp(converter, semaCtx, eval, /*genNested=*/false, - currentLocation, loopOpClauseList, ompDirective, - /*outerCombined=*/true); + currentLocation, beginClauseList, /*outerCombined=*/true); } if ((llvm::omp::allTeamsSet & llvm::omp::loopConstructSet) .test(ompDirective)) { validDirective = true; genTeamsOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, - loopOpClauseList, - /*outerCombined=*/true); + beginClauseList, /*outerCombined=*/true); } if (llvm::omp::allDistributeSet.test(ompDirective)) { validDirective = true; - TODO(currentLocation, "Distribute construct"); + genDistributeOp(converter, semaCtx, eval, /*genNested=*/false, + currentLocation, beginClauseList); } if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet) .test(ompDirective)) { validDirective = true; genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false, - currentLocation, loopOpClauseList, - /*outerCombined=*/true); + currentLocation, beginClauseList, /*outerCombined=*/true); } } if ((llvm::omp::allDoSet | llvm::omp::allSimdSet).test(ompDirective)) @@ -1968,16 +2249,14 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, if (llvm::omp::allDoSimdSet.test(ompDirective)) { // 2.9.3.2 Workshare SIMD construct - createSimdWsloop(converter, semaCtx, eval, ompDirective, loopOpClauseList, - endClauseList, currentLocation); - + genCompositeDoSimd(converter, semaCtx, eval, ompDirective, beginClauseList, + endClauseList, currentLocation); } else if (llvm::omp::allSimdSet.test(ompDirective)) { // 2.9.3.1 SIMD construct - createSimdLoop(converter, semaCtx, eval, ompDirective, loopOpClauseList, - currentLocation); + genSimdLoopOp(converter, semaCtx, eval, currentLocation, beginClauseList); } else { - createWsloop(converter, semaCtx, eval, ompDirective, loopOpClauseList, - endClauseList, currentLocation); + genWsloopOp(converter, semaCtx, eval, currentLocation, beginClauseList, + endClauseList); } } @@ -1997,44 +2276,39 @@ genOMP(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, const Fortran::parser::OpenMPSectionsConstruct §ionsConstruct) { - mlir::Location currentLocation = converter.getCurrentLocation(); - mlir::omp::SectionsClauseOps clauseOps; const auto &beginSectionsDirective = std::get(sectionsConstruct.t); - const auto §ionsClauseList = + const auto &beginClauseList = std::get(beginSectionsDirective.t); // Process clauses before optional omp.parallel, so that new 
variables are // allocated outside of the parallel region - ClauseProcessor cp(converter, semaCtx, sectionsClauseList); - cp.processSectionsReduction(currentLocation, clauseOps); - cp.processAllocate(clauseOps); - // TODO Support delayed privatization. + mlir::Location currentLocation = converter.getCurrentLocation(); + mlir::omp::SectionsClauseOps clauseOps; + genSectionsClauses(converter, semaCtx, beginClauseList, currentLocation, + /*clausesFromBeginSections=*/true, clauseOps); + // Parallel wrapper of PARALLEL SECTIONS construct llvm::omp::Directive dir = std::get(beginSectionsDirective.t) .v; - - // Parallel wrapper of PARALLEL SECTIONS construct if (dir == llvm::omp::Directive::OMPD_parallel_sections) { genParallelOp(converter, symTable, semaCtx, eval, - /*genNested=*/false, currentLocation, sectionsClauseList, + /*genNested=*/false, currentLocation, beginClauseList, /*outerCombined=*/true); } else { const auto &endSectionsDirective = std::get(sectionsConstruct.t); - const auto &endSectionsClauseList = + const auto &endClauseList = std::get(endSectionsDirective.t); - ClauseProcessor(converter, semaCtx, endSectionsClauseList) - .processNowait(clauseOps); + genSectionsClauses(converter, semaCtx, endClauseList, currentLocation, + /*clausesFromBeginSections=*/false, clauseOps); } - // SECTIONS construct - genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(false), - clauseOps); + // SECTIONS construct. + genSectionsOp(converter, semaCtx, eval, currentLocation, clauseOps); + // Generate nested SECTION operations recursively. const auto §ionBlocks = std::get(sectionsConstruct.t); auto &firOpBuilder = converter.getFirOpBuilder(); @@ -2043,40 +2317,12 @@ genOMP(Fortran::lower::AbstractConverter &converter, llvm::zip(sectionBlocks.v, eval.getNestedEvaluations())) { symTable.pushScope(); genSectionOp(converter, semaCtx, neval, /*genNested=*/true, currentLocation, - sectionsClauseList); + beginClauseList); symTable.popScope(); firOpBuilder.restoreInsertionPoint(ip); } } -static void -genOMP(Fortran::lower::AbstractConverter &converter, - Fortran::lower::SymMap &symTable, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPStandaloneConstruct &standaloneConstruct) { - std::visit( - Fortran::common::visitors{ - [&](const Fortran::parser::OpenMPSimpleStandaloneConstruct - &simpleStandaloneConstruct) { - genOmpSimpleStandalone(converter, semaCtx, eval, - /*genNested=*/true, - simpleStandaloneConstruct); - }, - [&](const Fortran::parser::OpenMPFlushConstruct &flushConstruct) { - genOmpFlush(converter, semaCtx, eval, flushConstruct); - }, - [&](const Fortran::parser::OpenMPCancelConstruct &cancelConstruct) { - TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct"); - }, - [&](const Fortran::parser::OpenMPCancellationPointConstruct - &cancellationPointConstruct) { - TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct"); - }, - }, - standaloneConstruct.u); -} - static void genOMP(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, diff --git a/flang/test/Lower/OpenMP/FIR/target.f90 b/flang/test/Lower/OpenMP/FIR/target.f90 index 022327f9c25daf..ca3162340d7846 100644 --- a/flang/test/Lower/OpenMP/FIR/target.f90 +++ b/flang/test/Lower/OpenMP/FIR/target.f90 @@ -411,8 +411,8 @@ end subroutine omp_target_implicit_bounds !CHECK-LABEL: func.func @_QPomp_target_thread_limit() { subroutine omp_target_thread_limit 
integer :: a - !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "a"} + !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !fir.ref) { !CHECK: ^bb0(%[[ARG_0]]: !fir.ref): !$omp target map(tofrom: a) thread_limit(64) diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 6f72b5a34d069a..51b66327dfb24b 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -490,8 +490,8 @@ end subroutine omp_target_implicit_bounds !CHECK-LABEL: func.func @_QPomp_target_thread_limit() { subroutine omp_target_thread_limit integer :: a - !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "a"} + !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %{{.*}} : !fir.ref) { !CHECK: ^bb0(%{{.*}}: !fir.ref): !$omp target map(tofrom: a) thread_limit(64) diff --git a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 index 33b5971656010a..d849dd206b9439 100644 --- a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 +++ b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 @@ -21,7 +21,7 @@ subroutine only_use_device_ptr !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr() !CHECK: omp.target_data use_device_ptr({{.*}} : !fir.ref>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>): +!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): subroutine mix_use_device_ptr_and_addr use iso_c_binding integer, pointer, dimension(:) :: array @@ -47,7 +47,7 @@ subroutine only_use_device_addr !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr_and_map() !CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}} : !fir.ref, !fir.ref) use_device_ptr(%{{.*}} : !fir.ref>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>): +!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): subroutine mix_use_device_ptr_and_addr_and_map use iso_c_binding integer :: i, j From 31424be3aef4290dd84065b9371fcd0c5014e097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 15 Apr 2024 13:14:41 +0200 Subject: [PATCH 069/300] [clang][Interp][NFC] Compare std::optionals directly --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 93059edc4622f8..4a7b40440770e3 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -1624,7 +1624,7 @@ bool ByteCodeExprGen::VisitCompoundAssignOperator( return false; if (!this->emitLoad(*LT, E)) return false; - if (*LT != *LHSComputationT) { + if (LT != LHSComputationT) { if (!this->emitCast(*LT, *LHSComputationT, E)) return false; } @@ -1680,7 +1680,7 @@ bool ByteCodeExprGen::VisitCompoundAssignOperator( } // And now cast from LHSComputationT to ResultT. 
- if (*ResultT != *LHSComputationT) { + if (ResultT != LHSComputationT) { if (!this->emitCast(*LHSComputationT, *ResultT, E)) return false; } From a831c54357c2bb7b8b457ccea22836c23e8b8625 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 15 Apr 2024 14:17:31 +0200 Subject: [PATCH 070/300] [clang][Interp] Avoid calling invalid functions Check if the non-null function pointer is even valid before calling the function. --- clang/lib/AST/Interp/FunctionPointer.h | 1 + clang/lib/AST/Interp/Interp.h | 4 ++++ clang/test/AST/Interp/functions.cpp | 15 +++++++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/FunctionPointer.h b/clang/lib/AST/Interp/FunctionPointer.h index c2ea295b82bdf5..fc3d7a4214a72b 100644 --- a/clang/lib/AST/Interp/FunctionPointer.h +++ b/clang/lib/AST/Interp/FunctionPointer.h @@ -32,6 +32,7 @@ class FunctionPointer final { const Function *getFunction() const { return Func; } bool isZero() const { return !Func; } + bool isValid() const { return Valid; } bool isWeak() const { if (!Func || !Valid) return false; diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 4182254357eb9a..dd0bacd73acb10 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2236,6 +2236,10 @@ inline bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize, << const_cast(E) << E->getSourceRange(); return false; } + + if (!FuncPtr.isValid()) + return false; + assert(F); // Check argument nullability state. diff --git a/clang/test/AST/Interp/functions.cpp b/clang/test/AST/Interp/functions.cpp index 4fb3c816000ab8..f9bb5d53634e0b 100644 --- a/clang/test/AST/Interp/functions.cpp +++ b/clang/test/AST/Interp/functions.cpp @@ -584,9 +584,20 @@ namespace VariadicOperator { namespace WeakCompare { [[gnu::weak]]void weak_method(); static_assert(weak_method != nullptr, ""); // both-error {{not an integral constant expression}} \ - // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} + // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} constexpr auto A = &weak_method; static_assert(A != nullptr, ""); // both-error {{not an integral constant expression}} \ - // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} + // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} +} + +namespace FromIntegral { +#if __cplusplus >= 202002L + typedef double (*DoubleFn)(); + int a[(int)DoubleFn((void*)-1)()]; // both-error {{not allowed at file scope}} \ + // both-warning {{variable length arrays}} + int b[(int)DoubleFn((void*)(-1 + 1))()]; // both-error {{not allowed at file scope}} \ + // expected-note {{evaluates to a null function pointer}} \ + // both-warning {{variable length arrays}} +#endif } From b0194d2894db49d7cf4d36aed87952c3e0c6a390 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Tue, 16 Apr 2024 18:19:50 +0800 Subject: [PATCH 071/300] [SEH] Ignore async exception flag when the environment is not MSVC (#88101) Fixes #62449 --- clang/lib/Driver/ToolChains/Clang.cpp | 28 +++++++++++++------ .../test/Driver/windows-seh-async-verify.cpp | 24 ++++++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) create mode 100644 clang/test/Driver/windows-seh-async-verify.cpp diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 
6d52eced104296..096ed14f957046 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -346,11 +346,14 @@ static bool addExceptionArgs(const ArgList &Args, types::ID InputType, bool EH = Args.hasFlag(options::OPT_fexceptions, options::OPT_fno_exceptions, false); - bool EHa = Args.hasFlag(options::OPT_fasync_exceptions, - options::OPT_fno_async_exceptions, false); - if (EHa) { - CmdArgs.push_back("-fasync-exceptions"); - EH = true; + // Async exceptions are Windows MSVC only. + if (Triple.isWindowsMSVCEnvironment()) { + bool EHa = Args.hasFlag(options::OPT_fasync_exceptions, + options::OPT_fno_async_exceptions, false); + if (EHa) { + CmdArgs.push_back("-fasync-exceptions"); + EH = true; + } } // Obj-C exceptions are enabled by default, regardless of -fexceptions. This @@ -8102,7 +8105,8 @@ struct EHFlags { /// The 'a' modifier is unimplemented and fundamentally hard in LLVM IR. /// - c: Assume that extern "C" functions are implicitly nounwind. /// The default is /EHs-c-, meaning cleanups are disabled. -static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args) { +static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args, + bool isWindowsMSVC) { EHFlags EH; std::vector EHArgs = @@ -8112,8 +8116,15 @@ static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args) { switch (EHVal[I]) { case 'a': EH.Asynch = maybeConsumeDash(EHVal, I); - if (EH.Asynch) + if (EH.Asynch) { + // Async exceptions are Windows MSVC only. + if (!isWindowsMSVC) { + EH.Asynch = false; + D.Diag(clang::diag::warn_drv_unused_argument) << "/EHa" << EHVal; + continue; + } EH.Synch = false; + } continue; case 'c': EH.NoUnwindC = maybeConsumeDash(EHVal, I); @@ -8177,7 +8188,8 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType, const Driver &D = getToolChain().getDriver(); - EHFlags EH = parseClangCLEHFlags(D, Args); + bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment(); + EHFlags EH = parseClangCLEHFlags(D, Args, IsWindowsMSVC); if (!isNVPTX && (EH.Synch || EH.Asynch)) { if (types::isCXX(InputType)) CmdArgs.push_back("-fcxx-exceptions"); diff --git a/clang/test/Driver/windows-seh-async-verify.cpp b/clang/test/Driver/windows-seh-async-verify.cpp new file mode 100644 index 00000000000000..5fda6a77dba049 --- /dev/null +++ b/clang/test/Driver/windows-seh-async-verify.cpp @@ -0,0 +1,24 @@ +// RUN: %clang --target=x86_64-pc-windows -fasync-exceptions -fsyntax-only %s -### 2>&1 | FileCheck %s +// RUN: %clang_cl --target=x86_64-pc-windows /EHa -fsyntax-only %s -### 2>&1 | FileCheck %s +// RUN: %clang --target=x86_64-pc-windows-gnu -fasync-exceptions -fsyntax-only %s -### 2>&1 | FileCheck %s --check-prefixes=GNU-ALL,GNU +// RUN: %clang_cl --target=x86_64-pc-windows-gnu /EHa -fsyntax-only %s -### 2>&1 | FileCheck %s --check-prefixes=GNU-ALL,CL-GNU + +// CHECK-NOT: warning +// GNU: warning: argument unused during compilation: '-fasync-exceptions' [-Wunused-command-line-argument] +// CL-GNU: warning: argument unused during compilation: '/EHa' [-Wunused-command-line-argument] + +// CHECK: -fasync-exceptions +// GNU-ALL-NOT: -fasync-exceptions +struct S { + union _Un { + ~_Un() {} + char _Buf[12]; + }; + _Un _un; +}; + +struct Embed { + S v2; +}; + +void PR62449() { Embed v{}; } From f4960da6023b8034ae68925c3223d51624621b37 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Tue, 16 Apr 2024 11:24:45 +0100 Subject: [PATCH 072/300] Revert "[Verifier] Reject va_start in non-variadic function (#88809)" This reverts 
commit 61717c1aa1f08eb57839a21fb2d9004739022e0d. Failed an MLIR test. --- llvm/lib/Analysis/Lint.cpp | 5 ++++- llvm/lib/IR/Verifier.cpp | 5 ----- llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll | 2 +- llvm/test/Other/lint.ll | 7 +++++++ llvm/test/Verifier/variadic.ll | 8 -------- 5 files changed, 12 insertions(+), 15 deletions(-) delete mode 100644 llvm/test/Verifier/variadic.ll diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 1ab856ac8830a9..0694c2995dfcce 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -350,7 +350,10 @@ void Lint::visitCallBase(CallBase &I) { } case Intrinsic::vastart: - // vastart in non-varargs function is rejected by the verifier + Check(I.getParent()->getParent()->isVarArg(), + "Undefined behavior: va_start called in a non-varargs function", + &I); + visitMemoryReference(I, MemoryLocation::getForArgument(&I, 0, TLI), std::nullopt, nullptr, MemRef::Read | MemRef::Write); break; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 4cd61e6e531bff..516d4a0515569b 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5798,11 +5798,6 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { break; } - case Intrinsic::vastart: { - Check(Call.getFunction()->isVarArg(), - "va_start called in a non-varargs function"); - break; - } case Intrinsic::vector_reduce_and: case Intrinsic::vector_reduce_or: case Intrinsic::vector_reduce_xor: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll b/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll index 8c6e01d934c2d8..bd576d0f70e9c1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll @@ -3,7 +3,7 @@ declare void @llvm.va_start(ptr) -define void @test_va_start(ptr %list, ...) { +define void @test_va_start(ptr %list) { ; CHECK-LABEL: name: test_va_start ; CHECK: [[LIST:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK-IOS: G_VASTART [[LIST]](p0) :: (store (s64) into %ir.list, align 1) diff --git a/llvm/test/Other/lint.ll b/llvm/test/Other/lint.ll index 6fd2d40cd2f298..6b31b31a78c98a 100644 --- a/llvm/test/Other/lint.ll +++ b/llvm/test/Other/lint.ll @@ -124,6 +124,13 @@ define void @0() nounwind { ret void } +; CHECK: va_start called in a non-varargs function +declare void @llvm.va_start(ptr) +define void @not_vararg(ptr %p) nounwind { + call void @llvm.va_start(ptr %p) + ret void +} + ; CHECK: Undefined behavior: Branch to non-blockaddress define void @use_indbr() { indirectbr ptr @foo, [label %block] diff --git a/llvm/test/Verifier/variadic.ll b/llvm/test/Verifier/variadic.ll deleted file mode 100644 index 55e4a4da0a9203..00000000000000 --- a/llvm/test/Verifier/variadic.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: not opt -S -passes=verify 2>&1 < %s | FileCheck %s - -; CHECK: va_start called in a non-varargs function -declare void @llvm.va_start(ptr) -define void @not_vararg(ptr %p) nounwind { - call void @llvm.va_start(ptr %p) - ret void -} From 01f79899ba349a0200586c8d05f5e22cca2ced31 Mon Sep 17 00:00:00 2001 From: Jinyang He Date: Tue, 16 Apr 2024 18:31:03 +0800 Subject: [PATCH 073/300] [LoongArch] Use R_LARCH_ALIGN with section symbol (#84741) In LoongArch psABI v2.30, R_LARCH_ALIGN requires a symbol index to support the third parameter of the alignment directive. Creating a symbol for each section is redundant, because every section already has a section symbol that can serve as the symbol index. So use the section symbol directly for R_LARCH_ALIGN.
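A minimal sketch of the directive involved (the addend encoding below is inferred from this patch's updated test expectations, not quoted from the psABI itself):

```asm
.text
break 1
# Align to 2^4 = 16 bytes, emitting at most 8 bytes of padding.
.p2align 4, , 8
break 2
```

With relaxation enabled, the padding now carries an R_LARCH_ALIGN relocation against the `.text` section symbol with addend `0x804`, which the tests suggest encodes `max-bytes << 8 | log2(alignment)`; previously the relocation referenced a per-section `.Lla-relax-align` temporary symbol instead.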
--- lld/ELF/InputSection.cpp | 6 +++- lld/test/ELF/loongarch-relax-align-ldr.s | 28 +++++++++++++++++++ lld/test/ELF/loongarch-relax-emit-relocs.s | 5 ++-- .../MCTargetDesc/LoongArchAsmBackend.cpp | 7 ++--- .../MC/LoongArch/Relocations/relax-addsub.s | 2 +- .../MC/LoongArch/Relocations/relax-align.s | 14 ++++++---- 6 files changed, 47 insertions(+), 15 deletions(-) create mode 100644 lld/test/ELF/loongarch-relax-align-ldr.s diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index c8350652e65a67..fa48552b8f7a12 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -464,7 +464,11 @@ void InputSection::copyRelocations(uint8_t *buf, addend += sec->getFile()->mipsGp0; } - if (RelTy::IsRela) + if (config->emachine == EM_LOONGARCH && type == R_LARCH_ALIGN) + // LoongArch psABI v2.30, the R_LARCH_ALIGN requires symbol index. + // If it use the section symbol, the addend should not be changed. + p->r_addend = addend; + else if (RelTy::IsRela) p->r_addend = sym.getVA(addend) - section->getOutputSection()->addr; // For SHF_ALLOC sections relocated by REL, append a relocation to // sec->relocations so that relocateAlloc transitively called by diff --git a/lld/test/ELF/loongarch-relax-align-ldr.s b/lld/test/ELF/loongarch-relax-align-ldr.s new file mode 100644 index 00000000000000..6534dc906cfd02 --- /dev/null +++ b/lld/test/ELF/loongarch-relax-align-ldr.s @@ -0,0 +1,28 @@ +# REQUIRES: loongarch +## Test `ld -r` not changes the addend of R_LARCH_ALIGN. + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.64.o +# RUN: ld.lld -r %t.64.o %t.64.o -o %t.64.r +# RUN: llvm-objdump -dr --no-show-raw-insn %t.64.r | FileCheck %s + +# CHECK: <.text>: +# CHECK-NEXT: break 1 +# CHECK-NEXT: nop +# CHECK-NEXT: {{0*}}04: R_LARCH_ALIGN .text+0x804 +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: break 2 +# CHECK-NEXT: break 0 +# CHECK-NEXT: break 0 +# CHECK-NEXT: break 0 +# CHECK-NEXT: break 1 +# CHECK-NEXT: nop +# CHECK-NEXT: {{0*}}24: R_LARCH_ALIGN .text+0x804 +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: break 2 + +.text +break 1 +.p2align 4, , 8 +break 2 diff --git a/lld/test/ELF/loongarch-relax-emit-relocs.s b/lld/test/ELF/loongarch-relax-emit-relocs.s index 581fce8c95caa4..9007f8fcc114f0 100644 --- a/lld/test/ELF/loongarch-relax-emit-relocs.s +++ b/lld/test/ELF/loongarch-relax-emit-relocs.s @@ -25,7 +25,7 @@ # CHECK-NEXT: R_LARCH_PCALA_LO12 _start # CHECK-NEXT: R_LARCH_RELAX *ABS* # CHECK-NEXT: nop -# CHECK-NEXT: R_LARCH_ALIGN .Lla-relax-align0+0x4 +# CHECK-NEXT: R_LARCH_ALIGN .text+0x4 # CHECK-NEXT: nop # CHECK-NEXT: ret @@ -37,11 +37,12 @@ # CHECKR-NEXT: R_LARCH_PCALA_LO12 _start # CHECKR-NEXT: R_LARCH_RELAX *ABS* # CHECKR-NEXT: nop -# CHECKR-NEXT: R_LARCH_ALIGN .Lla-relax-align0+0x4 +# CHECKR-NEXT: R_LARCH_ALIGN .text+0x4 # CHECKR-NEXT: nop # CHECKR-NEXT: nop # CHECKR-NEXT: ret +.text .global _start _start: la.pcrel $a0, _start diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index de492f2b1f0a4f..98f5014a34b1de 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -226,11 +226,8 @@ bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign( MCFixup::create(0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_align)); const MCSymbolRefExpr *MCSym = getSecToAlignSym()[Sec]; if (MCSym == nullptr) { - // Create a symbol and make the value of symbol is zero. 
- MCSymbol *Sym = Ctx.createNamedTempSymbol("la-relax-align"); - Sym->setFragment(&*Sec->getBeginSymbol()->getFragment()); - Asm.registerSymbol(*Sym); - MCSym = MCSymbolRefExpr::create(Sym, Ctx); + // Use section symbol directly. + MCSym = MCSymbolRefExpr::create(Sec->getBeginSymbol(), Ctx); getSecToAlignSym()[Sec] = MCSym; } diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s index 18e0ede5e29375..0e27d6301bb3cd 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s @@ -28,7 +28,7 @@ # RELAX: Relocations [ # RELAX-NEXT: Section ({{.*}}) .rela.text { -# RELAX-NEXT: 0x4 R_LARCH_ALIGN {{.*}} 0x4 +# RELAX-NEXT: 0x4 R_LARCH_ALIGN .text 0x4 # RELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .L1 0x0 # RELAX-NEXT: 0x10 R_LARCH_RELAX - 0x0 # RELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .L1 0x0 diff --git a/llvm/test/MC/LoongArch/Relocations/relax-align.s b/llvm/test/MC/LoongArch/Relocations/relax-align.s index 294fd9fb916c75..0246d5b46431c9 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-align.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-align.s @@ -63,17 +63,19 @@ ret ## Test the symbol index is different from .text. .section .text2, "ax" .p2align 4 +.p2align 4, , 4 break 7 # RELOC: Relocations [ # RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text { -# RELAX-RELOC-NEXT: 0x24 R_LARCH_ALIGN .Lla-relax-align0 0x4 -# RELAX-RELOC-NEXT: 0x34 R_LARCH_ALIGN .Lla-relax-align0 0x5 -# RELAX-RELOC-NEXT: 0x50 R_LARCH_ALIGN .Lla-relax-align0 0x4 -# RELAX-RELOC-NEXT: 0x60 R_LARCH_ALIGN .Lla-relax-align0 0xB04 -# RELAX-RELOC-NEXT: 0x70 R_LARCH_ALIGN .Lla-relax-align0 0x4 +# RELAX-RELOC-NEXT: 0x24 R_LARCH_ALIGN .text 0x4 +# RELAX-RELOC-NEXT: 0x34 R_LARCH_ALIGN .text 0x5 +# RELAX-RELOC-NEXT: 0x50 R_LARCH_ALIGN .text 0x4 +# RELAX-RELOC-NEXT: 0x60 R_LARCH_ALIGN .text 0xB04 +# RELAX-RELOC-NEXT: 0x70 R_LARCH_ALIGN .text 0x4 # RELAX-RELOC-NEXT: } # RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text2 { -# RELAX-RELOC-NEXT: 0x0 R_LARCH_ALIGN .Lla-relax-align1 0x4 +# RELAX-RELOC-NEXT: 0x0 R_LARCH_ALIGN .text2 0x4 +# RELAX-RELOC-NEXT: 0xC R_LARCH_ALIGN .text2 0x404 # RELAX-RELOC-NEXT: } # RELOC-NEXT: ] From c09384e2b419c7b4e4167e0d0295d9018cc6169c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 15 Apr 2024 15:11:23 +0200 Subject: [PATCH 074/300] [clang][Interp] Support MemberExprs pointing to VarDecls --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 11 +++++++++-- clang/test/AST/Interp/records.cpp | 8 ++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 4a7b40440770e3..a069f3ec27e721 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -1267,10 +1267,19 @@ template bool ByteCodeExprGen::VisitMemberExpr(const MemberExpr *E) { // 'Base.Member' const Expr *Base = E->getBase(); + const ValueDecl *Member = E->getMemberDecl(); if (DiscardResult) return this->discard(Base); + if (const auto *VD = dyn_cast(Member)) { + // I am almost confident in saying that a var decl must be static + // and therefore registered as a global variable. But this will probably + // turn out to be wrong some time in the future, as always. 
+ if (auto GlobalIndex = P.getGlobal(VD)) + return this->emitGetPtrGlobal(*GlobalIndex, E); + } + if (Initializing) { if (!this->delegate(Base)) return false; @@ -1280,8 +1289,6 @@ bool ByteCodeExprGen::VisitMemberExpr(const MemberExpr *E) { } // Base above gives us a pointer on the stack. - // TODO: Implement non-FieldDecl members. - const ValueDecl *Member = E->getMemberDecl(); if (const auto *FD = dyn_cast(Member)) { const RecordDecl *RD = FD->getParent(); const Record *R = getRecord(RD); diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index f251497ed70182..2c33fa1bf88432 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -1309,3 +1309,11 @@ namespace pr18633 { func2(); } } + +namespace { + struct F { + static constexpr int Z = 12; + }; + F f; + static_assert(f.Z == 12, ""); +} From 1120d8e6f799121b611aa23bdc128e40cf9c6c58 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 16 Apr 2024 13:37:29 +0300 Subject: [PATCH 075/300] [clang][CodeGen] Add AS for Globals to SPIR & SPIRV datalayouts (#88455) Currently neither the SPIR nor the SPIRV targets specify the AS for globals in their datalayout strings. This is problematic because CodeGen/LLVM will default to AS0 in this case, which produces Globals that end up in the private address space for e.g. OCL, HIPSPV or SYCL. This patch addresses it by completing the datalayout string. --- clang/lib/Basic/Targets/SPIR.h | 8 +++--- clang/test/CodeGen/target-data.c | 4 +-- llvm/lib/IR/AutoUpgrade.cpp | 9 ++++--- llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 4 +-- .../Bitcode/DataLayoutUpgradeTest.cpp | 27 +++++++++++++++++++ 5 files changed, 40 insertions(+), 12 deletions(-) diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index e25991e3dfe821..9a4a8b501460b6 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -259,7 +259,7 @@ class LLVM_LIBRARY_VISIBILITY SPIR32TargetInfo : public SPIRTargetInfo { SizeType = TargetInfo::UnsignedInt; PtrDiffType = IntPtrType = TargetInfo::SignedInt; resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, @@ -276,7 +276,7 @@ class LLVM_LIBRARY_VISIBILITY SPIR64TargetInfo : public SPIRTargetInfo { SizeType = TargetInfo::UnsignedLong; PtrDiffType = IntPtrType = TargetInfo::SignedLong; resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, @@ -336,7 +336,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRV32TargetInfo : public BaseSPIRVTargetInfo { SizeType = TargetInfo::UnsignedInt; PtrDiffType = IntPtrType = TargetInfo::SignedInt; resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, @@ -357,7 +357,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64TargetInfo : public BaseSPIRVTargetInfo { SizeType = TargetInfo::UnsignedLong; PtrDiffType = IntPtrType = TargetInfo::SignedLong; resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, diff --git 
a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index acff367d50eb91..c184f314f68f80 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -251,11 +251,11 @@ // RUN: %clang_cc1 -triple spir-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=SPIR -// SPIR: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +// SPIR: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1" // RUN: %clang_cc1 -triple spir64-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=SPIR64 -// SPIR64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +// SPIR64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1" // RUN: %clang_cc1 -triple bpfel -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=BPFEL diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 2c480fb76ee4de..634b2dd5119e8d 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5341,10 +5341,11 @@ MDNode *llvm::upgradeInstructionLoopAttachment(MDNode &N) { std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { Triple T(TT); - // The only data layout upgrades needed for pre-GCN are setting the address - // space of globals to 1. - if (T.isAMDGPU() && !T.isAMDGCN() && !DL.contains("-G") && - !DL.starts_with("G")) { + // The only data layout upgrades needed for pre-GCN, SPIR or SPIRV are setting + // the address space of globals to 1. This does not apply to SPIRV Logical. + if (((T.isAMDGPU() && !T.isAMDGCN()) || + (T.isSPIR() || (T.isSPIRV() && !T.isSPIRVLogical()))) && + !DL.contains("-G") && !DL.starts_with("G")) { return DL.empty() ? std::string("G1") : (DL + "-G1").str(); } diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index fbf64f2b1dfb13..ae8baa3f119132 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -55,9 +55,9 @@ static std::string computeDataLayout(const Triple &TT) { // mean anything. if (Arch == Triple::spirv32) return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"; + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"; return "e-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"; + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"; } static Reloc::Model getEffectiveRelocModel(std::optional RM) { diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp index 4865616e3e2ba1..d7e4dba4ac1703 100644 --- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp +++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp @@ -59,6 +59,14 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) { EXPECT_EQ(UpgradeDataLayoutString("e-m:e-p:64:64-i64:64-i128:128-n64-S128", "riscv64"), "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"); + + // Check that SPIR && SPIRV targets add -G1 if it's not present. 
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spir"), "e-p:32:32-G1"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spir64"), "e-p:32:32-G1"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spirv32"), "e-p:32:32-G1"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spirv64"), "e-p:32:32-G1"); + // but that SPIRV Logical does not. + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spirv"), "e-p:32:32"); } TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) { @@ -100,6 +108,17 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) { "p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128-p9:192:256:256:32"); EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"), "e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128-p9:192:256:256:32"); + + // Check that SPIR & SPIRV targets don't add -G1 if there is already a -G + // flag. + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spir"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spir64"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spirv32"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spirv64"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spir"), "G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spir64"), "G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spirv32"), "G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spirv64"), "G2"); } TEST(DataLayoutUpgradeTest, EmptyDataLayout) { @@ -113,6 +132,14 @@ TEST(DataLayoutUpgradeTest, EmptyDataLayout) { EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "G1"); EXPECT_EQ(UpgradeDataLayoutString("", "amdgcn"), "G1-ni:7:8:9-p7:160:256:256:32-p8:128:128-p9:192:256:256:32"); + + // Check that SPIR & SPIRV targets add G1 if it's not present. + EXPECT_EQ(UpgradeDataLayoutString("", "spir"), "G1"); + EXPECT_EQ(UpgradeDataLayoutString("", "spir64"), "G1"); + EXPECT_EQ(UpgradeDataLayoutString("", "spirv32"), "G1"); + EXPECT_EQ(UpgradeDataLayoutString("", "spirv64"), "G1"); + // but SPIRV Logical does not. + EXPECT_EQ(UpgradeDataLayoutString("", "spirv"), ""); } } // end namespace From 36b3c26451bf9a42f0b6b415993d3942bb73abdd Mon Sep 17 00:00:00 2001 From: XChy Date: Tue, 16 Apr 2024 18:51:03 +0800 Subject: [PATCH 076/300] [JumpThreading] Thread over BB with only an unconditional branch (#86312) Fixes #76609 This patch does two things: - relaxes the phis constraint in `CanRedirectPredsOfEmptyBBToSucc` - guarantees that BB has multiple different predecessors to redirect, so that we can handle the case without phis in BB. Without the second change, relaxing the phi constraint alone could let us redirect the CommonPred. The motivation is the same as for JumpThreading in general: we always want a branch to jump more directly to its destination, without passing through the middle block. This exposes further optimization opportunities. An obvious example, proposed by @dtcxzyw: ```llvm define i32 @test(...) { entry: br i1 %c, label %do.end, label %if.then if.then: ; preds = %entry %call2 = call i32 @dummy() %tobool3.not = icmp eq i32 %call2, 0 br i1 %tobool3.not, label %do.end, label %return do.end: ; preds = %entry, %if.then br label %return return: ; preds = %if.then, %do.end %retval.0 = phi i32 [ 0, %do.end ], [ %call2, %if.then ] ret i32 %retval.0 } ``` `entry` can jump directly to `return` without passing through `do.end`, and the if-else pattern can then be simplified further: ```llvm define i32 @test(...)
{ entry: br i1 %c, label %return, label %if.then if.then: ; preds = %entry %call2 = call i32 @dummy() br label %return return: ; preds = %if.then %retval.0 = phi i32 [ 0, %entry ], [ %call2, %if.then ] ret i32 %retval.0 } ``` --- llvm/lib/Transforms/Utils/Local.cpp | 10 +- llvm/test/CodeGen/AArch64/and-sink.ll | 9 +- .../AArch64/combine-comparisons-by-cse.ll | 122 +++++++---------- llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll | 18 +-- llvm/test/Transforms/JumpThreading/pr79175.ll | 8 +- llvm/test/Transforms/JumpThreading/select.ll | 50 +++---- .../Transforms/JumpThreading/thread-prob-7.ll | 8 +- .../Transforms/JumpThreading/uncond-no-phi.ll | 123 ++++++++++++++++++ .../PhaseOrdering/thread-uncond-bb.ll | 62 +++++++++ 9 files changed, 284 insertions(+), 126 deletions(-) create mode 100644 llvm/test/Transforms/JumpThreading/uncond-no-phi.ll create mode 100644 llvm/test/Transforms/PhaseOrdering/thread-uncond-bb.ll diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index a42ef0c4e6ae9e..baec51a07fcbfc 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1019,12 +1019,14 @@ CanRedirectPredsOfEmptyBBToSucc(BasicBlock *BB, BasicBlock *Succ, const SmallPtrSetImpl &SuccPreds, BasicBlock *&CommonPred) { - // There must be phis in BB, otherwise BB will be merged into Succ directly - if (BB->phis().empty() || Succ->phis().empty()) + // When Succ has no phis, BB may be merged into Succ directly. We don't need + // to redirect the predecessors of BB in this case. + if (Succ->phis().empty()) return false; - // BB must have predecessors not shared that can be redirected to Succ - if (!BB->hasNPredecessorsOrMore(2)) + // BB must have multiple different predecessors, so that at least one of + // predecessors can be redirected to Succ, except the common predecessor. 
+ if (BB->getUniquePredecessor() || pred_empty(BB)) return false; // Get single common predecessors of both BB and Succ diff --git a/llvm/test/CodeGen/AArch64/and-sink.ll b/llvm/test/CodeGen/AArch64/and-sink.ll index f298a55dab721e..a57e9d54f3078e 100644 --- a/llvm/test/CodeGen/AArch64/and-sink.ll +++ b/llvm/test/CodeGen/AArch64/and-sink.ll @@ -11,15 +11,14 @@ define dso_local i32 @and_sink1(i32 %a, i1 %c) { ; CHECK-LABEL: and_sink1: ; CHECK: // %bb.0: -; CHECK-NEXT: tbz w1, #0, .LBB0_3 +; CHECK-NEXT: tbz w1, #0, .LBB0_2 ; CHECK-NEXT: // %bb.1: // %bb0 +; CHECK-NEXT: tst w0, #0x4 ; CHECK-NEXT: adrp x8, A +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: str wzr, [x8, :lo12:A] -; CHECK-NEXT: tbnz w0, #2, .LBB0_3 -; CHECK-NEXT: // %bb.2: -; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB0_3: // %bb2 +; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll index 6449c3e11d6672..dde3e81833a63d 100644 --- a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll +++ b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll @@ -13,10 +13,10 @@ define i32 @combine_gt_ge_10() #0 { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:a ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] -; CHECK-NEXT: ldr w8, [x8] -; CHECK-NEXT: cmp w8, #10 +; CHECK-NEXT: ldr w9, [x8] ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] +; CHECK-NEXT: cmp w9, #10 ; CHECK-NEXT: b.le .LBB0_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true ; CHECK-NEXT: adrp x9, :got:c @@ -29,18 +29,17 @@ define i32 @combine_gt_ge_10() #0 { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_3: // %lor.lhs.false -; CHECK-NEXT: b.lt .LBB0_6 +; CHECK-NEXT: cmp w9, #10 +; CHECK-NEXT: b.lt .LBB0_5 ; CHECK-NEXT: .LBB0_4: // %land.lhs.true3 ; CHECK-NEXT: adrp x9, :got:d ; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB0_6 -; CHECK-NEXT: // %bb.5: -; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB0_6: // %if.end +; CHECK-NEXT: .LBB0_5: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: @@ -145,10 +144,10 @@ define i32 @combine_lt_ge_5() #0 { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:a ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] -; CHECK-NEXT: ldr w8, [x8] -; CHECK-NEXT: cmp w8, #5 +; CHECK-NEXT: ldr w9, [x8] ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] +; CHECK-NEXT: cmp w9, #5 ; CHECK-NEXT: b.ge .LBB2_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true ; CHECK-NEXT: adrp x9, :got:c @@ -161,18 +160,17 @@ define i32 @combine_lt_ge_5() #0 { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_3: // %lor.lhs.false -; CHECK-NEXT: b.gt .LBB2_6 +; CHECK-NEXT: cmp w9, #5 +; CHECK-NEXT: b.gt .LBB2_5 ; CHECK-NEXT: .LBB2_4: // %land.lhs.true3 ; CHECK-NEXT: adrp x9, :got:d ; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB2_6 -; CHECK-NEXT: // %bb.5: -; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_6: // %if.end +; CHECK-NEXT: .LBB2_5: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: @@ -499,24 +497,17 @@ define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 { ; CHECK-NEXT: // %bb.3: // %while.cond.while.end_crit_edge ; CHECK-NEXT: ldr w8, [x19] ; 
CHECK-NEXT: .LBB7_4: // %while.end -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: b.gt .LBB7_7 -; CHECK-NEXT: // %bb.5: // %land.lhs.true -; CHECK-NEXT: adrp x8, :got:b -; CHECK-NEXT: adrp x9, :got:d -; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] -; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] -; CHECK-NEXT: ldr w8, [x8] -; CHECK-NEXT: ldr w9, [x9] -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB7_7 -; CHECK-NEXT: // %bb.6: -; CHECK-NEXT: mov w0, #123 // =0x7b -; CHECK-NEXT: b .LBB7_8 -; CHECK-NEXT: .LBB7_7: // %if.end -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: .LBB7_8: // %return +; CHECK-NEXT: adrp x9, :got:b +; CHECK-NEXT: adrp x10, :got:d +; CHECK-NEXT: ldr x9, [x9, :got_lo12:b] +; CHECK-NEXT: ldr x10, [x10, :got_lo12:d] ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr w9, [x9] +; CHECK-NEXT: ldr w10, [x10] +; CHECK-NEXT: cmp w9, w10 +; CHECK-NEXT: ccmp w8, #2, #0, eq +; CHECK-NEXT: mov w8, #123 // =0x7b +; CHECK-NEXT: csel w0, w8, wzr, lt ; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w19 @@ -564,52 +555,42 @@ return: ; preds = %if.end, %land.lhs.t define i32 @do_nothing_if_compares_can_not_be_adjusted_to_each_other() #0 { ; CHECK-LABEL: do_nothing_if_compares_can_not_be_adjusted_to_each_other: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: .cfi_remember_state ; CHECK-NEXT: adrp x8, :got:a ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: b.gt .LBB8_3 +; CHECK-NEXT: b.gt .LBB8_4 ; CHECK-NEXT: // %bb.1: // %while.body.preheader +; CHECK-NEXT: stp x30, x19, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: sub w19, w8, #1 ; CHECK-NEXT: .LBB8_2: // %while.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: bl do_something ; CHECK-NEXT: adds w19, w19, #1 ; CHECK-NEXT: b.mi .LBB8_2 -; CHECK-NEXT: .LBB8_3: // %while.end -; CHECK-NEXT: adrp x8, :got:c -; CHECK-NEXT: ldr x8, [x8, :got_lo12:c] -; CHECK-NEXT: ldr w8, [x8] -; CHECK-NEXT: cmn w8, #2 -; CHECK-NEXT: b.lt .LBB8_6 -; CHECK-NEXT: // %bb.4: // %land.lhs.true +; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .LBB8_4: // %while.end ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:d +; CHECK-NEXT: adrp x10, :got:c ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] ; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] +; CHECK-NEXT: ldr x10, [x10, :got_lo12:c] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] +; CHECK-NEXT: ldr w10, [x10] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB8_6 -; CHECK-NEXT: // %bb.5: -; CHECK-NEXT: mov w0, #123 // =0x7b -; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w19 -; CHECK-NEXT: .cfi_restore w30 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB8_6: // %if.end -; CHECK-NEXT: .cfi_restore_state -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w19 -; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: mov w8, #-3 // =0xfffffffd +; CHECK-NEXT: ccmp w10, w8, #4, eq +; CHECK-NEXT: mov w8, #123 // =0x7b +; CHECK-NEXT: csel w0, w8, wzr, gt ; CHECK-NEXT: ret entry: %0 = load i32, ptr @a, align 4 @@ -782,12 +763,14 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 { ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: csel x9, x0, xzr, gt ; CHECK-NEXT: str x9, [x1] -; CHECK-NEXT: b.le .LBB11_2 +; CHECK-NEXT: b.le .LBB11_3 ; CHECK-NEXT: // %bb.1: // %lor.lhs.false ; CHECK-NEXT: cmp w8, #2 -; CHECK-NEXT: b.ge .LBB11_4 -; CHECK-NEXT: b .LBB11_6 -; CHECK-NEXT: .LBB11_2: // %land.lhs.true +; CHECK-NEXT: b.ge .LBB11_5 +; CHECK-NEXT: // %bb.2: +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB11_3: // %land.lhs.true ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:c ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] @@ -795,11 +778,11 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB11_4 -; CHECK-NEXT: // %bb.3: +; CHECK-NEXT: b.ne .LBB11_5 +; CHECK-NEXT: // %bb.4: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB11_4: // %land.lhs.true3 +; CHECK-NEXT: .LBB11_5: // %land.lhs.true3 ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:d ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] @@ -807,12 +790,7 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB11_6 -; CHECK-NEXT: // %bb.5: -; CHECK-NEXT: mov w0, #1 // =0x1 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB11_6: // %if.end -; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret entry: %0 = load i32, ptr @a, align 4 diff --git a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll index dddc4bd953d7ac..c33c81841be65e 100644 --- 
a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll +++ b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll @@ -10,12 +10,13 @@ define i32 @fred(ptr %a0) #0 { ; CHECK-LABEL: fred: ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) jump:nt .LBB0_2 +; CHECK-NEXT: r1:0 = combine(r0,#0) +; CHECK-NEXT: if (p0) jumpr r31 ; CHECK-NEXT: } -; CHECK-NEXT: // %bb.1: // %b2 +; CHECK-NEXT: .LBB0_1: // %b2 ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#0,#0) -; CHECK-NEXT: r1:0 = memd(r0+#0) +; CHECK-NEXT: r1:0 = memd(r1+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: p0 = vcmph.eq(r1:0,r3:2) @@ -27,16 +28,7 @@ define i32 @fred(ptr %a0) #0 { ; CHECK-NEXT: r0 = and(r0,#1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: p0 = cmp.eq(r0,#11) -; CHECK-NEXT: r0 = #1 -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: if (p0) r0 = #0 -; CHECK-NEXT: jumpr r31 -; CHECK-NEXT: } -; CHECK-NEXT: .LBB0_2: // %b14 -; CHECK-NEXT: { -; CHECK-NEXT: r0 = #0 +; CHECK-NEXT: r0 = !cmp.eq(r0,#11) ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } b0: diff --git a/llvm/test/Transforms/JumpThreading/pr79175.ll b/llvm/test/Transforms/JumpThreading/pr79175.ll index 2c7ee0770cdc73..cce30ce079999c 100644 --- a/llvm/test/Transforms/JumpThreading/pr79175.ll +++ b/llvm/test/Transforms/JumpThreading/pr79175.ll @@ -17,11 +17,11 @@ define i32 @test(i64 %idx, i32 %val) { ; CHECK: cond.end: ; CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[VAL]], 0 ; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[CMP_I]] -; CHECK-NEXT: br i1 [[COND_FR]], label [[COND_END_THREAD]], label [[TMP0:%.*]] -; CHECK: cond.end.thread: -; CHECK-NEXT: br label [[TMP0]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[TMP0:%.*]], label [[COND_END_THREAD]] ; CHECK: 0: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[COND_END_THREAD]] ], [ [[VAL]], [[COND_END]] ] +; CHECK-NEXT: br label [[COND_END_THREAD]] +; CHECK: cond.end.thread: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[VAL]], [[COND_END]] ], [ 0, [[TMP0]] ], [ 0, [[FOR_BODY]] ] ; CHECK-NEXT: [[F_IDX:%.*]] = getelementptr inbounds i32, ptr @f, i64 [[IDX]] ; CHECK-NEXT: store i32 [[TMP1]], ptr [[F_IDX]], align 4 ; CHECK-NEXT: [[F_RELOAD:%.*]] = load i32, ptr @f, align 4 diff --git a/llvm/test/Transforms/JumpThreading/select.ll b/llvm/test/Transforms/JumpThreading/select.ll index 4ec55a66bb8ac1..27ebf4c25da509 100644 --- a/llvm/test/Transforms/JumpThreading/select.ll +++ b/llvm/test/Transforms/JumpThreading/select.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -S -passes="jump-threading" -debug-only=branch-prob < %s 2>&1 | FileCheck %s -; RUN: opt -S -passes="require,jump-threading" -debug-only=branch-prob < %s 2>&1 | FileCheck -check-prefixes=CHECK,CHECK-BPI %s +; RUN: opt -S -passes="require,jump-threading" -debug-only=branch-prob -disable-output < %s 2>&1 | FileCheck -check-prefix=CHECK-BPI %s ; REQUIRES: asserts ; CHECK-BPI-LABEL: ---- Branch Probability Info : unfold1 ---- @@ -21,7 +21,7 @@ declare void @quux() ; booleans where at least one operand is true/false/undef. ;. -; CHECK: @[[ANCHOR:[a-zA-Z0-9_$"\\.-]+]] = constant [3 x ptr] [ptr blockaddress(@test_indirectbr, [[L1:%.*]]), ptr inttoptr (i32 1 to ptr), ptr blockaddress(@test_indirectbr, [[L3:%.*]])] +; CHECK: @anchor = constant [3 x ptr] [ptr blockaddress(@test_indirectbr, %L1), ptr inttoptr (i32 1 to ptr), ptr blockaddress(@test_indirectbr, %L3)] ;. 
define void @test_br(i1 %cond, i1 %value) nounwind { ; CHECK-LABEL: @test_br( @@ -66,8 +66,8 @@ define void @test_switch(i1 %cond, i8 %value) nounwind { ; CHECK-NEXT: call void @quux() ; CHECK-NEXT: [[EXPR:%.*]] = select i1 [[COND]], i8 1, i8 [[VALUE:%.*]] ; CHECK-NEXT: switch i8 [[EXPR]], label [[L3:%.*]] [ -; CHECK-NEXT: i8 1, label [[L1]] -; CHECK-NEXT: i8 2, label [[L2:%.*]] +; CHECK-NEXT: i8 1, label [[L1]] +; CHECK-NEXT: i8 2, label [[L2:%.*]] ; CHECK-NEXT: ] ; CHECK: L1: ; CHECK-NEXT: call void @foo() @@ -192,8 +192,8 @@ define void @test_switch_cmp(i1 %cond, i32 %val, i8 %value) nounwind { ; CHECK: 0: ; CHECK-NEXT: [[TMP1:%.*]] = phi i8 [ [[VALUE:%.*]], [[L0]] ] ; CHECK-NEXT: switch i8 [[TMP1]], label [[L3:%.*]] [ -; CHECK-NEXT: i8 1, label [[L1]] -; CHECK-NEXT: i8 2, label [[L2:%.*]] +; CHECK-NEXT: i8 1, label [[L1]] +; CHECK-NEXT: i8 2, label [[L2:%.*]] ; CHECK-NEXT: ] ; CHECK: L1: ; CHECK-NEXT: call void @foo() @@ -237,8 +237,8 @@ define void @test_switch_default(ptr nocapture %status) nounwind { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[STATUS:%.*]], align 4 ; CHECK-NEXT: switch i32 [[TMP0]], label [[L2:%.*]] [ -; CHECK-NEXT: i32 5061, label [[L2_THREAD:%.*]] -; CHECK-NEXT: i32 0, label [[L2]] +; CHECK-NEXT: i32 5061, label [[L2_THREAD:%.*]] +; CHECK-NEXT: i32 0, label [[L2]] ; CHECK-NEXT: ] ; CHECK: L2.thread: ; CHECK-NEXT: store i32 10025, ptr [[STATUS]], align 4 @@ -377,21 +377,21 @@ define i32 @unfold3(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) noun ; CHECK-NEXT: br i1 [[CMP_I]], label [[DOTEXIT_THREAD4:%.*]], label [[COND_FALSE_I:%.*]] ; CHECK: cond.false.i: ; CHECK-NEXT: [[CMP4_I:%.*]] = icmp sgt i32 [[U]], [[V]] -; CHECK-NEXT: br i1 [[CMP4_I]], label [[DOTEXIT_THREAD:%.*]], label [[COND_FALSE_6_I:%.*]] +; CHECK-NEXT: br i1 [[CMP4_I]], label [[DOTEXIT_THREAD4]], label [[COND_FALSE_6_I:%.*]] ; CHECK: cond.false.6.i: ; CHECK-NEXT: [[CMP8_I:%.*]] = icmp slt i32 [[W:%.*]], [[X:%.*]] ; CHECK-NEXT: br i1 [[CMP8_I]], label [[DOTEXIT_THREAD4]], label [[COND_FALSE_10_I:%.*]] ; CHECK: cond.false.10.i: ; CHECK-NEXT: [[CMP13_I:%.*]] = icmp sgt i32 [[W]], [[X]] -; CHECK-NEXT: br i1 [[CMP13_I]], label [[DOTEXIT_THREAD]], label [[DOTEXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP13_I]], label [[DOTEXIT_THREAD4]], label [[DOTEXIT:%.*]] ; CHECK: .exit: ; CHECK-NEXT: [[PHITMP:%.*]] = icmp sge i32 [[Y:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[PHITMP]] -; CHECK-NEXT: br i1 [[COND_FR]], label [[DOTEXIT_THREAD]], label [[DOTEXIT_THREAD4]] -; CHECK: .exit.thread: +; CHECK-NEXT: br i1 [[COND_FR]], label [[DOTEXIT_THREAD:%.*]], label [[DOTEXIT_THREAD4]] +; CHECK: 0: ; CHECK-NEXT: br label [[DOTEXIT_THREAD4]] -; CHECK: .exit.thread4: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[J]], [[DOTEXIT_THREAD]] ], [ [[ADD3]], [[DOTEXIT]] ], [ [[ADD3]], [[ENTRY:%.*]] ], [ [[ADD3]], [[COND_FALSE_6_I]] ] +; CHECK: .exit.thread: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[ADD3]], [[DOTEXIT]] ], [ [[J]], [[DOTEXIT_THREAD]] ], [ [[J]], [[COND_FALSE_I]] ], [ [[J]], [[COND_FALSE_10_I]] ], [ [[ADD3]], [[ENTRY:%.*]] ], [ [[ADD3]], [[COND_FALSE_6_I]] ] ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: @@ -430,23 +430,23 @@ define i32 @unfold4(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) noun ; CHECK-NEXT: br i1 [[CMP_I]], label [[DOTEXIT_THREAD:%.*]], label [[COND_FALSE_I:%.*]] ; CHECK: cond.false.i: ; CHECK-NEXT: [[CMP4_I:%.*]] = icmp sgt i32 [[U]], [[V]] -; CHECK-NEXT: br i1 [[CMP4_I]], label [[DOTEXIT_THREAD5:%.*]], label [[COND_FALSE_6_I:%.*]] +; CHECK-NEXT: br i1 
[[CMP4_I]], label [[DOTEXIT_THREAD]], label [[COND_FALSE_6_I:%.*]] ; CHECK: cond.false.6.i: ; CHECK-NEXT: [[CMP8_I:%.*]] = icmp slt i32 [[W:%.*]], [[X:%.*]] ; CHECK-NEXT: br i1 [[CMP8_I]], label [[DOTEXIT_THREAD]], label [[COND_FALSE_10_I:%.*]] ; CHECK: cond.false.10.i: ; CHECK-NEXT: [[CMP13_I:%.*]] = icmp sgt i32 [[W]], [[X]] -; CHECK-NEXT: br i1 [[CMP13_I]], label [[DOTEXIT_THREAD5]], label [[DOTEXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP13_I]], label [[DOTEXIT_THREAD]], label [[DOTEXIT:%.*]] ; CHECK: .exit: ; CHECK-NEXT: [[CMP19_I:%.*]] = icmp sge i32 [[Y:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP19_I]] to i32 ; CHECK-NEXT: [[LNOT_I18:%.*]] = icmp eq i32 [[CONV]], 1 ; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[LNOT_I18]] -; CHECK-NEXT: br i1 [[COND_FR]], label [[DOTEXIT_THREAD]], label [[DOTEXIT_THREAD5]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[TMP1:%.*]], label [[DOTEXIT_THREAD]] +; CHECK: 0: +; CHECK-NEXT: br label [[DOTEXIT_THREAD]] ; CHECK: .exit.thread: -; CHECK-NEXT: br label [[DOTEXIT_THREAD5]] -; CHECK: .exit.thread5: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[J]], [[DOTEXIT_THREAD]] ], [ [[ADD3]], [[DOTEXIT]] ], [ [[ADD3]], [[COND_FALSE_I]] ], [ [[ADD3]], [[COND_FALSE_10_I]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[ADD3]], [[DOTEXIT]] ], [ [[J]], [[TMP1]] ], [ [[J]], [[ENTRY:%.*]] ], [ [[J]], [[COND_FALSE_6_I]] ], [ [[ADD3]], [[COND_FALSE_I]] ], [ [[ADD3]], [[COND_FALSE_10_I]] ] ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: @@ -560,10 +560,10 @@ define void @test_func(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr ; CHECK: if.end: ; CHECK-NEXT: [[LOCAL_VAR_0:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY]] ] ; CHECK-NEXT: switch i32 [[LOCAL_VAR_0]], label [[SW_DEFAULT]] [ -; CHECK-NEXT: i32 2, label [[SW_BB]] -; CHECK-NEXT: i32 4, label [[SW_BB7]] -; CHECK-NEXT: i32 5, label [[SW_BB8:%.*]] -; CHECK-NEXT: i32 7, label [[SW_BB9:%.*]] +; CHECK-NEXT: i32 2, label [[SW_BB]] +; CHECK-NEXT: i32 4, label [[SW_BB7]] +; CHECK-NEXT: i32 5, label [[SW_BB8:%.*]] +; CHECK-NEXT: i32 7, label [[SW_BB9:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: call void @foo() @@ -674,3 +674,5 @@ if.end: ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1984} ; CHECK: [[PROF1]] = !{!"branch_weights", i64 1073741824, i64 3221225472} ;. +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-BPI: {{.*}} diff --git a/llvm/test/Transforms/JumpThreading/thread-prob-7.ll b/llvm/test/Transforms/JumpThreading/thread-prob-7.ll index 8c9d89871d00b3..4623a579be48f6 100644 --- a/llvm/test/Transforms/JumpThreading/thread-prob-7.ll +++ b/llvm/test/Transforms/JumpThreading/thread-prob-7.ll @@ -14,15 +14,15 @@ define i32 @func0(i32 %a0, i32 %a1) !prof !0 { ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_JOIN_THREAD:%.*]], label [[TEST2_FALSE:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: test2_false: ; CHECK-NEXT: call void @foobar() -; CHECK-NEXT: br label [[TMP0:%.*]] +; CHECK-NEXT: br label [[BB_JOIN_THREAD]] ; CHECK: bb_join: ; CHECK-NEXT: [[C:%.*]] = phi i1 [ [[CX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[C]] -; CHECK-NEXT: br i1 [[COND_FR]], label [[BB_JOIN_THREAD]], label [[TMP0]], !prof [[PROF3:![0-9]+]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[BB_JOIN_THREAD1:%.*]], label [[BB_JOIN_THREAD]], !prof [[PROF3:![0-9]+]] ; CHECK: bb_join.thread: -; CHECK-NEXT: br label [[TMP0]] +; CHECK-NEXT: br label [[BB_JOIN_THREAD]] ; CHECK: 0: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 42, [[BB_JOIN_THREAD]] ], [ 7, [[BB_JOIN]] ], [ 7, [[TEST2_FALSE]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 7, [[BB_JOIN]] ], [ 7, [[TEST2_FALSE]] ], [ 42, [[TEST2]] ], [ 42, [[BB_JOIN_THREAD1]] ] ; CHECK-NEXT: ret i32 [[TMP1]] ; entry: diff --git a/llvm/test/Transforms/JumpThreading/uncond-no-phi.ll b/llvm/test/Transforms/JumpThreading/uncond-no-phi.ll new file mode 100644 index 00000000000000..6104e8f8778bc0 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/uncond-no-phi.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=jump-threading -S < %s | FileCheck %s + +define i1 @if_else(i1 %c, i1 %c1) { +; CHECK-LABEL: define i1 @if_else( +; CHECK-SAME: i1 [[C:%.*]], i1 [[C1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[RETURN:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br i1 [[C1]], label [[ELSE:%.*]], label [[RETURN]] +; CHECK: else: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i1 [ false, [[THEN]] ], [ true, [[ENTRY:%.*]] ], [ true, [[ELSE]] ] +; CHECK-NEXT: ret i1 [[RETVAL_0]] +; +entry: + br i1 %c, label %then, label %else + +then: + call void @dummy() + br i1 %c1, label %else, label %return + +else: + br label %return + +return: + %retval.0 = phi i1 [ true, %else ], [ false, %then ] + ret i1 %retval.0 +} + +define i8 @switch_uncond(i8 %arg) { +; CHECK-LABEL: define i8 @switch_uncond( +; CHECK-SAME: i8 [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i8 [[ARG]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i8 0, label [[BB1:%.*]] +; CHECK-NEXT: i8 1, label [[BB3:%.*]] +; CHECK-NEXT: i8 2, label [[BB2:%.*]] +; CHECK-NEXT: i8 3, label [[END:%.*]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: unreachable +; CHECK: bb: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: bb1: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: bb2: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI:%.*]] = phi i8 [ 1, [[ENTRY:%.*]] ], [ 0, [[BB3]] ], [ 0, [[BB1]] ], [ 0, [[BB2]] ] +; CHECK-NEXT: ret i8 [[PHI]] +; +entry: + switch i8 %arg, label %default [ + i8 0, label %bb + i8 1, label %bb1 + i8 2, label %bb2 + i8 3, label %end + ] + +default: + unreachable + +bb: + call void @dummy() + br label %bb2 + +bb1: + call 
void @dummy() + br label %bb2 + +; Predecessors of %bb2 are %bb and %bb1, they are not identical. +; So we can thread %bb2. +bb2: + br label %end + +end: + %phi = phi i8 [ 0, %bb2 ], [ 1, %entry ] + ret i8 %phi +} + +define i8 @switch_uncond_fail(i8 %arg) { +; CHECK-LABEL: define i8 @switch_uncond_fail( +; CHECK-SAME: i8 [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: switch i8 [[ARG]], label [[DEFAULT:%.*]] [ +; CHECK-NEXT: i8 0, label [[BB:%.*]] +; CHECK-NEXT: i8 1, label [[BB]] +; CHECK-NEXT: i8 2, label [[END:%.*]] +; CHECK-NEXT: ] +; CHECK: default: +; CHECK-NEXT: br label [[END]] +; CHECK: bb: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI:%.*]] = phi i8 [ 0, [[BB]] ], [ 1, [[ENTRY:%.*]] ], [ 2, [[DEFAULT]] ] +; CHECK-NEXT: ret i8 [[PHI]] +; +entry: + switch i8 %arg, label %default [ + i8 0, label %bb + i8 1, label %bb + i8 2, label %end + ] + +default: + br label %end + +; Predecessor of %bb is only %entry (though there are two in predecessor list), +; thus it's unthreadable. +bb: + br label %end + +end: + %phi = phi i8 [ 0, %bb ], [ 1, %entry ], [ 2, %default ] + ret i8 %phi +} + +declare void @dummy() diff --git a/llvm/test/Transforms/PhaseOrdering/thread-uncond-bb.ll b/llvm/test/Transforms/PhaseOrdering/thread-uncond-bb.ll new file mode 100644 index 00000000000000..17146d7d5987fc --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/thread-uncond-bb.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -O3 -S | FileCheck %s + +define i32 @thread_uncond_bb_cmp(i1 %c, i32 %v) { +; CHECK-LABEL: define i32 @thread_uncond_bb_cmp( +; CHECK-SAME: i1 [[C:%.*]], i32 [[V:%.*]]) local_unnamed_addr { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[DO_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @dummy() +; CHECK-NEXT: br label [[DO_END]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[V]], [[IF_THEN]] ] +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + br i1 %c, label %do.end, label %if.then + +if.then: ; preds = %entry + call void @dummy() + %tobool = icmp eq i32 %v, 0 + br i1 %tobool, label %do.end, label %return + +do.end: ; preds = %entry, %if.then + br label %return + +return: ; preds = %if.then, %do.end + %retval = phi i32 [ 0, %do.end ], [ %v, %if.then ] + ret i32 %retval +} + +define i32 @thread_uncond_bb_cmp_zext(i1 %c, i32 %v) { +; CHECK-LABEL: define i32 @thread_uncond_bb_cmp_zext( +; CHECK-SAME: i1 [[C:%.*]], i32 [[V:%.*]]) local_unnamed_addr { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[DO_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @dummy() +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[V]], 0 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[TOBOOL]] to i32 +; CHECK-NEXT: br label [[DO_END]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ] +; CHECK-NEXT: ret i32 [[RETVAL]] +; +entry: + br i1 %c, label %do.end, label %if.then + +if.then: ; preds = %entry + call void @dummy() + %tobool = icmp eq i32 %v, 0 + br i1 %tobool, label %do.end, label %return + +do.end: ; preds = %entry, %if.then + br label %return + +return: ; preds = %if.then, %do.end + %retval = phi i32 [ 0, %do.end ], [ 1, %if.then ] + ret i32 %retval +} + +declare void @dummy() From 32b74ca6e41768c91eee8b8ca26235b110a65deb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 15 Apr 2024 15:41:33 +0200 
Subject: [PATCH 077/300] [clang][Interp] Load value from MemberExpr if required --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 17 ++++++++++++++--- clang/test/AST/Interp/cxx03.cpp | 14 ++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index a069f3ec27e721..00c4a9f161304a 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -1272,12 +1272,23 @@ bool ByteCodeExprGen<Emitter>::VisitMemberExpr(const MemberExpr *E) { if (DiscardResult) return this->discard(Base); + // MemberExprs are almost always lvalues, in which case we don't need to + // do the load. But sometimes they aren't. + const auto maybeLoadValue = [&]() -> bool { + if (E->isGLValue()) + return true; + if (std::optional<PrimType> T = classify(E)) + return this->emitLoadPop(*T, E); + return false; + }; + if (const auto *VD = dyn_cast<VarDecl>(Member)) { // I am almost confident in saying that a var decl must be static // and therefore registered as a global variable. But this will probably // turn out to be wrong some time in the future, as always. if (auto GlobalIndex = P.getGlobal(VD)) - return this->emitGetPtrGlobal(*GlobalIndex, E); + return this->emitGetPtrGlobal(*GlobalIndex, E) && maybeLoadValue(); + return false; } if (Initializing) { @@ -1295,8 +1306,8 @@ bool ByteCodeExprGen<Emitter>::VisitMemberExpr(const MemberExpr *E) { const Record::Field *F = R->getField(FD); // Leave a pointer to the field on the stack. if (F->Decl->getType()->isReferenceType()) - return this->emitGetFieldPop(PT_Ptr, F->Offset, E); - return this->emitGetPtrField(F->Offset, E); + return this->emitGetFieldPop(PT_Ptr, F->Offset, E) && maybeLoadValue(); + return this->emitGetPtrField(F->Offset, E) && maybeLoadValue(); } return false; diff --git a/clang/test/AST/Interp/cxx03.cpp b/clang/test/AST/Interp/cxx03.cpp index d30cbb2fd7a201..b6aaf0840cfb36 100644 --- a/clang/test/AST/Interp/cxx03.cpp +++ b/clang/test/AST/Interp/cxx03.cpp @@ -10,3 +10,17 @@ namespace NonInitializingMemberExpr { // both-note {{required by}} \ // both-note {{subexpression not valid}} } + + +namespace NonLValueMemberExpr { + struct PODType { + int value; + }; + +#define ATTR __attribute__((require_constant_initialization)) + struct TT1 { + ATTR static const int &subobj_init; + }; + + const int &TT1::subobj_init = PODType().value; +} From 92e96c7bbacbb477265c7e5ff6c49a6de5d4ee69 Mon Sep 17 00:00:00 2001 From: Malay Sanghi Date: Tue, 16 Apr 2024 16:36:17 +0530 Subject: [PATCH 078/300] [X86][GISel] Add DU chain lookups for LOAD & STORE (#87453) For G_LOAD and G_STORE we want this information during regbankselect. Today we treat the load destination as an integer and insert converts.
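A minimal IR sketch of the pattern this improves (hypothetical example, not taken from the patch's tests): with the def-use lookup, regbankselect can see that the loaded value only feeds a floating-point instruction and assign it the FP/vector bank directly, instead of defaulting the G_LOAD result to the GPR bank and inserting a cross-bank copy before the G_FADD.

```llvm
define float @load_feeds_fadd(ptr %p) {
  %v = load float, ptr %p
  %r = fadd float %v, %v
  ret float %r
}
```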
--------- Co-authored-by: Evgenii Kudriashov --- llvm/include/llvm/CodeGen/GlobalISel/Utils.h | 4 + llvm/lib/CodeGen/GlobalISel/Utils.cpp | 44 +++++ .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 37 ----- llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp | 24 +-- .../PowerPC/GISel/PPCRegisterBankInfo.cpp | 39 +---- .../RISCV/GISel/RISCVRegisterBankInfo.cpp | 40 ----- .../Target/X86/GISel/X86RegisterBankInfo.cpp | 139 +++++++++++++++- .../Target/X86/GISel/X86RegisterBankInfo.h | 16 ++ llvm/test/CodeGen/X86/GlobalISel/fconstant.ll | 13 +- .../regbankselect-sse-intrinsics.ll | 153 ++++++++++++++++++ .../X86/GlobalISel/regbankselect-x87.ll | 29 ++-- 11 files changed, 367 insertions(+), 171 deletions(-) create mode 100644 llvm/test/CodeGen/X86/GlobalISel/regbankselect-sse-intrinsics.ll diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index 807cec3c177d9f..c4174cee5e10c6 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -555,5 +555,9 @@ void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, /// debug users of \p MI by writing the effect of \p MI in a DIExpression. void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI); +/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode, +/// having only floating-point operands. +bool isPreISelGenericFloatingPointOpcode(unsigned Opc); + } // End namespace llvm. #endif diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index c3bc3203b63605..ae43e9ccf6112d 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -1665,3 +1665,47 @@ void llvm::salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI) { } } } + +bool llvm::isPreISelGenericFloatingPointOpcode(unsigned Opc) { + switch (Opc) { + case TargetOpcode::G_FABS: + case TargetOpcode::G_FADD: + case TargetOpcode::G_FCANONICALIZE: + case TargetOpcode::G_FCEIL: + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FCOPYSIGN: + case TargetOpcode::G_FCOS: + case TargetOpcode::G_FDIV: + case TargetOpcode::G_FEXP2: + case TargetOpcode::G_FEXP: + case TargetOpcode::G_FFLOOR: + case TargetOpcode::G_FLOG10: + case TargetOpcode::G_FLOG2: + case TargetOpcode::G_FLOG: + case TargetOpcode::G_FMA: + case TargetOpcode::G_FMAD: + case TargetOpcode::G_FMAXIMUM: + case TargetOpcode::G_FMAXNUM: + case TargetOpcode::G_FMAXNUM_IEEE: + case TargetOpcode::G_FMINIMUM: + case TargetOpcode::G_FMINNUM: + case TargetOpcode::G_FMINNUM_IEEE: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FNEARBYINT: + case TargetOpcode::G_FNEG: + case TargetOpcode::G_FPEXT: + case TargetOpcode::G_FPOW: + case TargetOpcode::G_FPTRUNC: + case TargetOpcode::G_FREM: + case TargetOpcode::G_FRINT: + case TargetOpcode::G_FSIN: + case TargetOpcode::G_FSQRT: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_INTRINSIC_ROUND: + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: + case TargetOpcode::G_INTRINSIC_TRUNC: + return true; + default: + return false; + } +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index d39de770eaf16e..d5c4ce1888e78c 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -424,43 +424,6 @@ void AArch64RegisterBankInfo::applyMappingImpl( } } -/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode, -/// 
having only floating-point operands. -static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { - switch (Opc) { - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FMA: - case TargetOpcode::G_FDIV: - case TargetOpcode::G_FCONSTANT: - case TargetOpcode::G_FPEXT: - case TargetOpcode::G_FPTRUNC: - case TargetOpcode::G_FCEIL: - case TargetOpcode::G_FFLOOR: - case TargetOpcode::G_FNEARBYINT: - case TargetOpcode::G_FNEG: - case TargetOpcode::G_FCOS: - case TargetOpcode::G_FSIN: - case TargetOpcode::G_FLOG10: - case TargetOpcode::G_FLOG: - case TargetOpcode::G_FLOG2: - case TargetOpcode::G_FSQRT: - case TargetOpcode::G_FABS: - case TargetOpcode::G_FEXP: - case TargetOpcode::G_FRINT: - case TargetOpcode::G_INTRINSIC_TRUNC: - case TargetOpcode::G_INTRINSIC_ROUND: - case TargetOpcode::G_INTRINSIC_ROUNDEVEN: - case TargetOpcode::G_FMAXNUM: - case TargetOpcode::G_FMINNUM: - case TargetOpcode::G_FMAXIMUM: - case TargetOpcode::G_FMINIMUM: - return true; - } - return false; -} - const RegisterBankInfo::InstructionMapping & AArch64RegisterBankInfo::getSameKindOfOperandsMapping( const MachineInstr &MI) const { diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp index 6af1fd8c88e570..62b58cba9f24a4 100644 --- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -104,26 +104,6 @@ MipsRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, } } -// Instructions where all register operands are floating point. -static bool isFloatingPointOpcode(unsigned Opc) { - switch (Opc) { - case TargetOpcode::G_FCONSTANT: - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FDIV: - case TargetOpcode::G_FABS: - case TargetOpcode::G_FSQRT: - case TargetOpcode::G_FCEIL: - case TargetOpcode::G_FFLOOR: - case TargetOpcode::G_FPEXT: - case TargetOpcode::G_FPTRUNC: - return true; - default: - return false; - } -} - // Instructions where use operands are floating point registers. // Def operands are general purpose. static bool isFloatingPointOpcodeUse(unsigned Opc) { @@ -133,7 +113,7 @@ static bool isFloatingPointOpcodeUse(unsigned Opc) { case TargetOpcode::G_FCMP: return true; default: - return isFloatingPointOpcode(Opc); + return isPreISelGenericFloatingPointOpcode(Opc); } } @@ -145,7 +125,7 @@ static bool isFloatingPointOpcodeDef(unsigned Opc) { case TargetOpcode::G_UITOFP: return true; default: - return isFloatingPointOpcode(Opc); + return isPreISelGenericFloatingPointOpcode(Opc); } } diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp index 6aeef145e3078f..125a49de7b27d4 100644 --- a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp +++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp @@ -13,6 +13,7 @@ #include "PPCRegisterBankInfo.h" #include "PPCRegisterInfo.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" @@ -239,44 +240,6 @@ PPCRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { return getInstructionMapping(MappingID, Cost, OperandsMapping, NumOperands); } -/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode, -/// having only floating-point operands. 
-/// FIXME: this is copied from target AArch64. Needs some code refactor here to -/// put this function in GlobalISel/Utils.cpp. -static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { - switch (Opc) { - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FMA: - case TargetOpcode::G_FDIV: - case TargetOpcode::G_FCONSTANT: - case TargetOpcode::G_FPEXT: - case TargetOpcode::G_FPTRUNC: - case TargetOpcode::G_FCEIL: - case TargetOpcode::G_FFLOOR: - case TargetOpcode::G_FNEARBYINT: - case TargetOpcode::G_FNEG: - case TargetOpcode::G_FCOS: - case TargetOpcode::G_FSIN: - case TargetOpcode::G_FLOG10: - case TargetOpcode::G_FLOG: - case TargetOpcode::G_FLOG2: - case TargetOpcode::G_FSQRT: - case TargetOpcode::G_FABS: - case TargetOpcode::G_FEXP: - case TargetOpcode::G_FRINT: - case TargetOpcode::G_INTRINSIC_TRUNC: - case TargetOpcode::G_INTRINSIC_ROUND: - case TargetOpcode::G_FMAXNUM: - case TargetOpcode::G_FMINNUM: - case TargetOpcode::G_FMAXIMUM: - case TargetOpcode::G_FMINIMUM: - return true; - } - return false; -} - /// \returns true if a given intrinsic \p ID only uses and defines FPRs. static bool isFPIntrinsic(unsigned ID) { // TODO: Add more intrinsics. diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp index 86e44343b50865..cc534f29685f25 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp @@ -154,46 +154,6 @@ static const RegisterBankInfo::ValueMapping *getFPValueMapping(unsigned Size) { return &RISCV::ValueMappings[Idx]; } -/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode, -/// having only floating-point operands. -/// FIXME: this is copied from target AArch64. Needs some code refactor here to -/// put this function in GlobalISel/Utils.cpp. -static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { - switch (Opc) { - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FMA: - case TargetOpcode::G_FDIV: - case TargetOpcode::G_FCONSTANT: - case TargetOpcode::G_FPEXT: - case TargetOpcode::G_FPTRUNC: - case TargetOpcode::G_FCEIL: - case TargetOpcode::G_FFLOOR: - case TargetOpcode::G_FNEARBYINT: - case TargetOpcode::G_FNEG: - case TargetOpcode::G_FCOPYSIGN: - case TargetOpcode::G_FCOS: - case TargetOpcode::G_FSIN: - case TargetOpcode::G_FLOG10: - case TargetOpcode::G_FLOG: - case TargetOpcode::G_FLOG2: - case TargetOpcode::G_FSQRT: - case TargetOpcode::G_FABS: - case TargetOpcode::G_FEXP: - case TargetOpcode::G_FRINT: - case TargetOpcode::G_INTRINSIC_TRUNC: - case TargetOpcode::G_INTRINSIC_ROUND: - case TargetOpcode::G_INTRINSIC_ROUNDEVEN: - case TargetOpcode::G_FMAXNUM: - case TargetOpcode::G_FMINNUM: - case TargetOpcode::G_FMAXIMUM: - case TargetOpcode::G_FMINIMUM: - return true; - } - return false; -} - // TODO: Make this more like AArch64? 
 bool RISCVRegisterBankInfo::hasFPConstraints(
     const MachineInstr &MI, const MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
index e7c9e60ba95f16..9e85424e76e620 100644
--- a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
@@ -13,10 +13,13 @@
 #include "X86RegisterBankInfo.h"
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterBank.h"
 #include "llvm/CodeGen/RegisterBankInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/IntrinsicsX86.h"
 
 #define GET_TARGET_REGBANK_IMPL
 #include "X86GenRegisterBank.inc"
@@ -68,6 +71,98 @@ X86RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   llvm_unreachable("Unsupported register kind yet.");
 }
 
+// \returns true if a given intrinsic only uses and defines FPRs.
+static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
+                          const MachineInstr &MI) {
+  // TODO: Add more intrinsics.
+  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
+  default:
+    return false;
+  // SSE1
+  case Intrinsic::x86_sse_rcp_ss:
+  case Intrinsic::x86_sse_rcp_ps:
+  case Intrinsic::x86_sse_rsqrt_ss:
+  case Intrinsic::x86_sse_rsqrt_ps:
+  case Intrinsic::x86_sse_min_ss:
+  case Intrinsic::x86_sse_min_ps:
+  case Intrinsic::x86_sse_max_ss:
+  case Intrinsic::x86_sse_max_ps:
+    return true;
+  }
+  return false;
+}
+
+bool X86RegisterBankInfo::hasFPConstraints(const MachineInstr &MI,
+                                           const MachineRegisterInfo &MRI,
+                                           const TargetRegisterInfo &TRI,
+                                           unsigned Depth) const {
+  unsigned Op = MI.getOpcode();
+  if (Op == TargetOpcode::G_INTRINSIC && isFPIntrinsic(MRI, MI))
+    return true;
+
+  // Do we have an explicit floating point instruction?
+  if (isPreISelGenericFloatingPointOpcode(Op))
+    return true;
+
+  // No. Check if we have a copy-like instruction. If we do, then we could
+  // still be fed by floating point instructions.
+  if (Op != TargetOpcode::COPY && !MI.isPHI() &&
+      !isPreISelGenericOptimizationHint(Op))
+    return false;
+
+  // Check if we already know the register bank.
+  auto *RB = getRegBank(MI.getOperand(0).getReg(), MRI, TRI);
+  if (RB == &getRegBank(X86::PSRRegBankID))
+    return true;
+  if (RB == &getRegBank(X86::GPRRegBankID))
+    return false;
+
+  // We don't know anything.
+  //
+  // If we have a phi, we may be able to infer that it will be assigned a fp
+  // type based off of its inputs.
+ if (!MI.isPHI() || Depth > MaxFPRSearchDepth) + return false; + + return any_of(MI.explicit_uses(), [&](const MachineOperand &Op) { + return Op.isReg() && + onlyDefinesFP(*MRI.getVRegDef(Op.getReg()), MRI, TRI, Depth + 1); + }); +} + +bool X86RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Depth) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: + case TargetOpcode::G_FCMP: + case TargetOpcode::G_LROUND: + case TargetOpcode::G_LLROUND: + case TargetOpcode::G_INTRINSIC_TRUNC: + case TargetOpcode::G_INTRINSIC_ROUND: + return true; + default: + break; + } + return hasFPConstraints(MI, MRI, TRI, Depth); +} + +bool X86RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Depth) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + return true; + default: + break; + } + return hasFPConstraints(MI, MRI, TRI, Depth); +} + X86GenRegisterBankInfo::PartialMappingIdx X86GenRegisterBankInfo::getPartialMappingIdx(const MachineInstr &MI, const LLT &Ty, bool isFP) { @@ -180,11 +275,13 @@ X86RegisterBankInfo::getSameOperandsMapping(const MachineInstr &MI, const RegisterBankInfo::InstructionMapping & X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned Opc = MI.getOpcode(); - // Try the default logic for non-generic instructions that are either copies - // or already have some operands assigned to banks. + // Try the default logic for non-generic instructions that are either + // copies or already have some operands assigned to banks. if (!isPreISelGenericOpcode(Opc) || Opc == TargetOpcode::G_PHI) { const InstructionMapping &Mapping = getInstrMappingImpl(MI); if (Mapping.isValid()) @@ -221,13 +318,14 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_FPEXT: case TargetOpcode::G_FPTRUNC: case TargetOpcode::G_FCONSTANT: - // Instruction having only floating-point operands (all scalars in VECRReg) + // Instruction having only floating-point operands (all scalars in + // VECRReg) getInstrPartialMappingIdxs(MI, MRI, /* isFP= */ true, OpRegBankIdx); break; case TargetOpcode::G_SITOFP: case TargetOpcode::G_FPTOSI: { - // Some of the floating-point instructions have mixed GPR and FP operands: - // fine-tune the computed mapping. + // Some of the floating-point instructions have mixed GPR and FP + // operands: fine-tune the computed mapping. auto &Op0 = MI.getOperand(0); auto &Op1 = MI.getOperand(1); const LLT Ty0 = MRI.getType(Op0.getReg()); @@ -271,9 +369,36 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getInstrPartialMappingIdxs(MI, MRI, /* isFP= */ isFPTrunc || isFPAnyExt, OpRegBankIdx); - } break; + break; + } + case TargetOpcode::G_LOAD: { + // Check if that load feeds fp instructions. + // In that case, we want the default mapping to be on FPR + // instead of blind map every scalar to GPR. + bool IsFP = any_of(MRI.use_nodbg_instructions(cast(MI).getDstReg()), + [&](const MachineInstr &UseMI) { + // If we have at least one direct use in a FP + // instruction, assume this was a floating point load + // in the IR. 
If it was not, we would have had a + // bitcast before reaching that instruction. + return onlyUsesFP(UseMI, MRI, TRI); + }); + getInstrPartialMappingIdxs(MI, MRI, IsFP, OpRegBankIdx); + break; + } + case TargetOpcode::G_STORE: { + // Check if that store is fed by fp instructions. + Register VReg = cast(MI).getValueReg(); + if (!VReg) + break; + MachineInstr *DefMI = MRI.getVRegDef(VReg); + bool IsFP = onlyDefinesFP(*DefMI, MRI, TRI); + getInstrPartialMappingIdxs(MI, MRI, IsFP, OpRegBankIdx); + break; + } default: - // Track the bank of each register, use NotFP mapping (all scalars in GPRs) + // Track the bank of each register, use NotFP mapping (all scalars in + // GPRs) getInstrPartialMappingIdxs(MI, MRI, /* isFP= */ false, OpRegBankIdx); break; } diff --git a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h index 989c5956ad5917..8f38e717e36b0b 100644 --- a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h +++ b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h @@ -62,6 +62,22 @@ class X86RegisterBankInfo final : public X86GenRegisterBankInfo { const SmallVectorImpl &OpRegBankIdx, SmallVectorImpl &OpdsMapping); + // Maximum recursion depth for hasFPConstraints. + const unsigned MaxFPRSearchDepth = 2; + + /// \returns true if \p MI only uses and defines FPRs. + bool hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Depth = 0) const; + + /// \returns true if \p MI only uses FPRs. + bool onlyUsesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, unsigned Depth = 0) const; + + /// \returns true if \p MI only defines FPRs. + bool onlyDefinesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, unsigned Depth = 0) const; + public: X86RegisterBankInfo(const TargetRegisterInfo &TRI); diff --git a/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll b/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll index a9b2037e9947a1..8d2ee3c50f215a 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll @@ -10,27 +10,22 @@ define void @test_float(ptr %a , float %b) { ; CHECK64_SMALL: # %bb.0: # %entry ; CHECK64_SMALL-NEXT: movss {{.*#+}} xmm1 = [5.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK64_SMALL-NEXT: addss %xmm0, %xmm1 -; CHECK64_SMALL-NEXT: movd %xmm1, %eax -; CHECK64_SMALL-NEXT: movl %eax, (%rdi) +; CHECK64_SMALL-NEXT: movss %xmm1, (%rdi) ; CHECK64_SMALL-NEXT: retq ; ; CHECK64_LARGE-LABEL: test_float: ; CHECK64_LARGE: # %bb.0: # %entry ; CHECK64_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; CHECK64_LARGE-NEXT: addss (%rax), %xmm0 -; CHECK64_LARGE-NEXT: movd %xmm0, %eax -; CHECK64_LARGE-NEXT: movl %eax, (%rdi) +; CHECK64_LARGE-NEXT: movss %xmm0, (%rdi) ; CHECK64_LARGE-NEXT: retq ; ; CHECK32-LABEL: test_float: ; CHECK32: # %bb.0: # %entry ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movss {{.*#+}} xmm0 = [5.5E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK32-NEXT: movd %ecx, %xmm1 -; CHECK32-NEXT: addss %xmm0, %xmm1 -; CHECK32-NEXT: movd %xmm1, %ecx -; CHECK32-NEXT: movl %ecx, (%eax) +; CHECK32-NEXT: addss {{[0-9]+}}(%esp), %xmm0 +; CHECK32-NEXT: movss %xmm0, (%eax) ; CHECK32-NEXT: retl entry: %aa = fadd float 5.500000e+00, %b diff --git a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-sse-intrinsics.ll b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-sse-intrinsics.ll new file mode 100644 index 00000000000000..3388af605d9691 --- 
/dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-sse-intrinsics.ll @@ -0,0 +1,153 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse -global-isel -stop-after=regbankselect | FileCheck %s + +define void @test_x86_sse_max_ps(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_max_ps + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.max.ps), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %a1 = load <4 x float>, ptr %p2, align 16 + %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone + + +define void @test_x86_sse_max_ss(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_max_ss + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.max.ss), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %a1 = load <4 x float>, ptr %p2, align 16 + %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone + + +define void @test_x86_sse_min_ps(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_min_ps + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from 
%ir.p1) + ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.min.ps), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %a1 = load <4 x float>, ptr %p2, align 16 + %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + + +define void @test_x86_sse_min_ss(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_min_ss + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.min.ss), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %a1 = load <4 x float>, ptr %p2, align 16 + %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone + + +define void @test_x86_sse_rcp_ps(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_rcp_ps + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rcp.ps), [[LOAD1]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + + +define void @test_x86_sse_rcp_ss(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_rcp_ss + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rcp.ss), [[LOAD1]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x 
s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone + + +define void @test_x86_sse_rsqrt_ps(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_rsqrt_ps + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rsqrt.ps), [[LOAD1]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + + +define void @test_x86_sse_rsqrt_ss(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_rsqrt_ss + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rsqrt.ss), [[LOAD1]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone diff --git a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll index d09db0f2474c96..99d458a183a9bd 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll @@ -142,7 +142,7 @@ define float @f4(float %val) { ; X86-LABEL: name: f4 ; X86: bb.1 (%ir-block.0): ; X86-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 - ; X86-NEXT: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (s32) from %fixed-stack.0) + ; X86-NEXT: [[LOAD:%[0-9]+]]:psr(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (s32) from %fixed-stack.0) ; X86-NEXT: $fp0 = COPY [[LOAD]](s32) ; X86-NEXT: RET 0, implicit $fp0 ; @@ -187,13 +187,10 @@ define void @f5(ptr %a, ptr %b) { ; X64-NEXT: {{ $}} ; X64-NEXT: [[COPY:%[0-9]+]]:gpr(p0) = COPY $rdi ; X64-NEXT: [[COPY1:%[0-9]+]]:gpr(p0) = COPY $rsi - ; X64-NEXT: [[LOAD:%[0-9]+]]:gpr(s64) = G_LOAD [[COPY]](p0) :: (load (s64) from %ir.a) - ; X64-NEXT: [[LOAD1:%[0-9]+]]:gpr(s64) = G_LOAD [[COPY1]](p0) :: (load (s64) from %ir.b) - ; X64-NEXT: [[COPY2:%[0-9]+]]:psr(s64) = COPY [[LOAD]](s64) - ; X64-NEXT: [[COPY3:%[0-9]+]]:psr(s64) = COPY [[LOAD1]](s64) - ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s64) = G_FADD [[COPY2]], [[COPY3]] - ; X64-NEXT: [[COPY4:%[0-9]+]]:gpr(s64) = COPY [[FADD]](s64) - ; X64-NEXT: G_STORE 
[[COPY4]](s64), [[COPY]](p0) :: (store (s64) into %ir.a) + ; X64-NEXT: [[LOAD:%[0-9]+]]:psr(s64) = G_LOAD [[COPY]](p0) :: (load (s64) from %ir.a) + ; X64-NEXT: [[LOAD1:%[0-9]+]]:psr(s64) = G_LOAD [[COPY1]](p0) :: (load (s64) from %ir.b) + ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s64) = G_FADD [[LOAD]], [[LOAD1]] + ; X64-NEXT: G_STORE [[FADD]](s64), [[COPY]](p0) :: (store (s64) into %ir.a) ; X64-NEXT: RET 0 %load1 = load double, ptr %a, align 8 %load2 = load double, ptr %b, align 8 @@ -210,11 +207,9 @@ define void @f6(ptr %0, ptr %1) { ; X86-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 ; X86-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0) ; X86-NEXT: [[C:%[0-9]+]]:psr(s32) = G_FCONSTANT float 2.000000e+01 - ; X86-NEXT: [[LOAD2:%[0-9]+]]:gpr(s32) = G_LOAD [[LOAD]](p0) :: (load (s32) from %ir.0) - ; X86-NEXT: [[COPY:%[0-9]+]]:psr(s32) = COPY [[LOAD2]](s32) - ; X86-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[COPY]], [[C]] - ; X86-NEXT: [[COPY1:%[0-9]+]]:gpr(s32) = COPY [[FADD]](s32) - ; X86-NEXT: G_STORE [[COPY1]](s32), [[LOAD1]](p0) :: (store (s32) into %ir.1) + ; X86-NEXT: [[LOAD2:%[0-9]+]]:psr(s32) = G_LOAD [[LOAD]](p0) :: (load (s32) from %ir.0) + ; X86-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[LOAD2]], [[C]] + ; X86-NEXT: G_STORE [[FADD]](s32), [[LOAD1]](p0) :: (store (s32) into %ir.1) ; X86-NEXT: RET 0 ; ; X64-LABEL: name: f6 @@ -224,11 +219,9 @@ define void @f6(ptr %0, ptr %1) { ; X64-NEXT: [[COPY:%[0-9]+]]:gpr(p0) = COPY $rdi ; X64-NEXT: [[COPY1:%[0-9]+]]:gpr(p0) = COPY $rsi ; X64-NEXT: [[C:%[0-9]+]]:psr(s32) = G_FCONSTANT float 2.000000e+01 - ; X64-NEXT: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[COPY]](p0) :: (load (s32) from %ir.0) - ; X64-NEXT: [[COPY2:%[0-9]+]]:psr(s32) = COPY [[LOAD]](s32) - ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[COPY2]], [[C]] - ; X64-NEXT: [[COPY3:%[0-9]+]]:gpr(s32) = COPY [[FADD]](s32) - ; X64-NEXT: G_STORE [[COPY3]](s32), [[COPY1]](p0) :: (store (s32) into %ir.1) + ; X64-NEXT: [[LOAD:%[0-9]+]]:psr(s32) = G_LOAD [[COPY]](p0) :: (load (s32) from %ir.0) + ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[LOAD]], [[C]] + ; X64-NEXT: G_STORE [[FADD]](s32), [[COPY1]](p0) :: (store (s32) into %ir.1) ; X64-NEXT: RET 0 %load1 = load float, ptr %0 %add = fadd float %load1, 20.0 From ac6b4c618a52c62cef9b143a767991dbba7453e1 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Tue, 16 Apr 2024 12:25:14 +0100 Subject: [PATCH 079/300] Reapply "[Verifier] Reject va_start in non-variadic function (#88809)" This reverts commit f4960da6023b8034ae68925c3223d51624621b37. Includes a fix for the MLIR test case. 
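For reference, a minimal sketch of the behavior change (function and value
names are illustrative), mirroring the new llvm/test/Verifier/variadic.ll
test: va_start is only meaningful inside a variadic function, and the check
now runs in the IR verifier rather than only under -passes=lint.

    declare void @llvm.va_start.p0(ptr)

    ; Fine: the enclosing function is variadic.
    define void @ok(...) {
      %ap = alloca ptr
      call void @llvm.va_start.p0(ptr %ap)
      ret void
    }

    ; Now rejected by the verifier with
    ; "va_start called in a non-varargs function":
    define void @bad(ptr %p) {
      call void @llvm.va_start.p0(ptr %p)
      ret void
    }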
--- llvm/lib/Analysis/Lint.cpp | 5 +---- llvm/lib/IR/Verifier.cpp | 5 +++++ llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll | 2 +- llvm/test/Other/lint.ll | 7 ------- llvm/test/Verifier/variadic.ll | 8 ++++++++ mlir/test/Target/LLVMIR/Import/intrinsic.ll | 2 +- 6 files changed, 16 insertions(+), 13 deletions(-) create mode 100644 llvm/test/Verifier/variadic.ll diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 0694c2995dfcce..1ab856ac8830a9 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -350,10 +350,7 @@ void Lint::visitCallBase(CallBase &I) { } case Intrinsic::vastart: - Check(I.getParent()->getParent()->isVarArg(), - "Undefined behavior: va_start called in a non-varargs function", - &I); - + // vastart in non-varargs function is rejected by the verifier visitMemoryReference(I, MemoryLocation::getForArgument(&I, 0, TLI), std::nullopt, nullptr, MemRef::Read | MemRef::Write); break; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 516d4a0515569b..4cd61e6e531bff 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5798,6 +5798,11 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { break; } + case Intrinsic::vastart: { + Check(Call.getFunction()->isVarArg(), + "va_start called in a non-varargs function"); + break; + } case Intrinsic::vector_reduce_and: case Intrinsic::vector_reduce_or: case Intrinsic::vector_reduce_xor: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll b/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll index bd576d0f70e9c1..8c6e01d934c2d8 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll @@ -3,7 +3,7 @@ declare void @llvm.va_start(ptr) -define void @test_va_start(ptr %list) { +define void @test_va_start(ptr %list, ...) { ; CHECK-LABEL: name: test_va_start ; CHECK: [[LIST:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK-IOS: G_VASTART [[LIST]](p0) :: (store (s64) into %ir.list, align 1) diff --git a/llvm/test/Other/lint.ll b/llvm/test/Other/lint.ll index 6b31b31a78c98a..6fd2d40cd2f298 100644 --- a/llvm/test/Other/lint.ll +++ b/llvm/test/Other/lint.ll @@ -124,13 +124,6 @@ define void @0() nounwind { ret void } -; CHECK: va_start called in a non-varargs function -declare void @llvm.va_start(ptr) -define void @not_vararg(ptr %p) nounwind { - call void @llvm.va_start(ptr %p) - ret void -} - ; CHECK: Undefined behavior: Branch to non-blockaddress define void @use_indbr() { indirectbr ptr @foo, [label %block] diff --git a/llvm/test/Verifier/variadic.ll b/llvm/test/Verifier/variadic.ll new file mode 100644 index 00000000000000..55e4a4da0a9203 --- /dev/null +++ b/llvm/test/Verifier/variadic.ll @@ -0,0 +1,8 @@ +; RUN: not opt -S -passes=verify 2>&1 < %s | FileCheck %s + +; CHECK: va_start called in a non-varargs function +declare void @llvm.va_start(ptr) +define void @not_vararg(ptr %p) nounwind { + call void @llvm.va_start(ptr %p) + ret void +} diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 81a6eadbadd3fc..bf6847a32ff4fc 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -597,7 +597,7 @@ define void @ushl_sat_test(i32 %0, i32 %1, <8 x i32> %2, <8 x i32> %3) { } ; CHECK-LABEL: llvm.func @va_intrinsics_test -define void @va_intrinsics_test(ptr %0, ptr %1) { +define void @va_intrinsics_test(ptr %0, ptr %1, ...) 
{ ; CHECK: llvm.intr.vastart %{{.*}} call void @llvm.va_start.p0(ptr %0) ; CHECK: llvm.intr.vacopy %{{.*}} to %{{.*}} From 09e7d7585cf881fb598eb56738579b84d027318c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 16 Apr 2024 07:52:39 +0200 Subject: [PATCH 080/300] [clang][Interp] Don't add 'in call to' diagnostics for builtin frames --- clang/lib/AST/Interp/InterpFrame.cpp | 7 ++++++ clang/lib/AST/Interp/State.cpp | 5 +++-- clang/test/AST/Interp/builtin-functions.cpp | 24 +++++++-------------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/clang/lib/AST/Interp/InterpFrame.cpp b/clang/lib/AST/Interp/InterpFrame.cpp index 12e2e6ff9155b9..ba957546473e98 100644 --- a/clang/lib/AST/Interp/InterpFrame.cpp +++ b/clang/lib/AST/Interp/InterpFrame.cpp @@ -152,6 +152,13 @@ void print(llvm::raw_ostream &OS, const Pointer &P, ASTContext &Ctx, } void InterpFrame::describe(llvm::raw_ostream &OS) const { + // We create frames for builtin functions as well, but we can't reliably + // diagnose them. The 'in call to' diagnostics for them add no value to the + // user _and_ it doesn't generally work since the argument types don't always + // match the function prototype. Just ignore them. + if (const auto *F = getFunction(); F && F->isBuiltin()) + return; + const FunctionDecl *F = getCallee(); if (const auto *M = dyn_cast(F); M && M->isInstance() && !isa(F)) { diff --git a/clang/lib/AST/Interp/State.cpp b/clang/lib/AST/Interp/State.cpp index 47fbf5145cd4e4..0d9dadec4b9581 100644 --- a/clang/lib/AST/Interp/State.cpp +++ b/clang/lib/AST/Interp/State.cpp @@ -155,7 +155,8 @@ void State::addCallStack(unsigned Limit) { SmallString<128> Buffer; llvm::raw_svector_ostream Out(Buffer); F->describe(Out); - addDiag(CallRange.getBegin(), diag::note_constexpr_call_here) - << Out.str() << CallRange; + if (!Buffer.empty()) + addDiag(CallRange.getBegin(), diag::note_constexpr_call_here) + << Out.str() << CallRange; } } diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp index a7adc92d3714fa..1a29a664d7ce54 100644 --- a/clang/test/AST/Interp/builtin-functions.cpp +++ b/clang/test/AST/Interp/builtin-functions.cpp @@ -24,16 +24,13 @@ namespace strcmp { static_assert(__builtin_strcmp("abab", "abab\0banana") == 0, ""); static_assert(__builtin_strcmp("abab\0banana", "abab\0canada") == 0, ""); static_assert(__builtin_strcmp(0, "abab") == 0, ""); // both-error {{not an integral constant}} \ - // both-note {{dereferenced null}} \ - // expected-note {{in call to}} + // both-note {{dereferenced null}} static_assert(__builtin_strcmp("abab", 0) == 0, ""); // both-error {{not an integral constant}} \ - // both-note {{dereferenced null}} \ - // expected-note {{in call to}} + // both-note {{dereferenced null}} static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar) == -1, ""); static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar + 6) == 0, ""); // both-error {{not an integral constant}} \ - // both-note {{dereferenced one-past-the-end}} \ - // expected-note {{in call to}} + // both-note {{dereferenced one-past-the-end}} /// Used to assert because we're passing a dummy pointer to /// __builtin_strcmp() when evaluating the return statement. 
@@ -72,14 +69,11 @@ constexpr const char *a = "foo\0quux"; static_assert(check(c), ""); constexpr int over1 = __builtin_strlen(a + 9); // both-error {{constant expression}} \ - // both-note {{one-past-the-end}} \ - // expected-note {{in call to}} + // both-note {{one-past-the-end}} constexpr int over2 = __builtin_strlen(b + 9); // both-error {{constant expression}} \ - // both-note {{one-past-the-end}} \ - // expected-note {{in call to}} + // both-note {{one-past-the-end}} constexpr int over3 = __builtin_strlen(c + 9); // both-error {{constant expression}} \ - // both-note {{one-past-the-end}} \ - // expected-note {{in call to}} + // both-note {{one-past-the-end}} constexpr int under1 = __builtin_strlen(a - 1); // both-error {{constant expression}} \ // both-note {{cannot refer to element -1}} @@ -90,8 +84,7 @@ constexpr const char *a = "foo\0quux"; constexpr char d[] = { 'f', 'o', 'o' }; // no nul terminator. constexpr int bad = __builtin_strlen(d); // both-error {{constant expression}} \ - // both-note {{one-past-the-end}} \ - // expected-note {{in call to}} + // both-note {{one-past-the-end}} } namespace nan { @@ -114,8 +107,7 @@ namespace nan { /// FIXME: Current interpreter misses diagnostics. constexpr char f2[] = {'0', 'x', 'A', 'E'}; /// No trailing 0 byte. constexpr double NaN7 = __builtin_nan(f2); // both-error {{must be initialized by a constant expression}} \ - // expected-note {{read of dereferenced one-past-the-end pointer}} \ - // expected-note {{in call to}} + // expected-note {{read of dereferenced one-past-the-end pointer}} static_assert(!__builtin_issignaling(__builtin_nan("")), ""); static_assert(__builtin_issignaling(__builtin_nans("")), ""); } From a8de3ee8994023ea7669397587f8118ae5bba9c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 16 Apr 2024 13:29:10 +0200 Subject: [PATCH 081/300] [clang][Interp][NFC] Fix some build warnings Fixes: /buildbot/worker/arc-folder/llvm-project/clang/lib/AST/Interp/Disasm.cpp:143:25: warning: cast from type 'const clang::interp::Block*' to type 'void*' casts away qualifiers [-Wcast-qual] /buildbot/worker/arc-folder/llvm-project/clang/lib/AST/Interp/Disasm.cpp:271:23: warning: cast from type 'const clang::interp::Block*' to type 'void*' casts away qualifiers [-Wcast-qual] --- clang/lib/AST/Interp/Disasm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index ebc4e4f195ba62..d127f33223e802 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -140,7 +140,7 @@ LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const { const Descriptor *Desc = G->block()->getDescriptor(); Pointer GP = getPtrGlobal(GI); - OS << GI << ": " << (void *)G->block() << " "; + OS << GI << ": " << (const void *)G->block() << " "; { ColorScope SC(OS, true, GP.isInitialized() @@ -268,7 +268,7 @@ LLVM_DUMP_METHOD void Record::dump(llvm::raw_ostream &OS, unsigned Indentation, LLVM_DUMP_METHOD void Block::dump(llvm::raw_ostream &OS) const { { ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_BLUE, true}); - OS << "Block " << (void *)this << "\n"; + OS << "Block " << (const void *)this << "\n"; } unsigned NPointers = 0; for (const Pointer *P = Pointers; P; P = P->Next) { From c18a3b6bd30456305cf1b3d78ad5a805577388c1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 16 Apr 2024 12:30:13 +0100 Subject: [PATCH 082/300] [DAG] Fold extract_subvector(insert_subvector(x,y,c1),c2) --> extract_subvector(y,c2-c1) (#87925) 
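For illustration, a worked instance of the fold with hypothetical indices
(c1 = 4, c2 = 6, extracting 4 elements from an 8-element subvector y, so the
extracted range lies entirely inside the inserted subvector):

    t1 = insert_subvector x, y, 4
    t2 = extract_subvector t1, 6
      -->
    t2 = extract_subvector y, 2    ; 2 = c2 - c1 = 6 - 4

The fold only applies when the extracted range is fully covered by the
inserted subvector and the target reports the narrower extract as cheap.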
(REAPPLIED) If the extract_subvector is cheap, attempt to extract directly from an inserted subvector Reapplied with a check to ensure we only attempt this for fixed vectors --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 + .../any_extend_vector_inreg_of_broadcast.ll | 42 +- llvm/test/CodeGen/X86/dpbusd.ll | 2 +- llvm/test/CodeGen/X86/dpbusd_i4.ll | 2 +- .../vector-interleaved-load-i16-stride-3.ll | 1300 ++++++++--------- .../vector-interleaved-store-i8-stride-7.ll | 602 ++++---- .../zero_extend_vector_inreg_of_broadcast.ll | 28 +- 7 files changed, 986 insertions(+), 1006 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0fa0bf2609bb31..c36b1cc9039c26 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24467,6 +24467,22 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { if (!LegalOperations || TLI.isOperationLegal(ISD::SPLAT_VECTOR, NVT)) return DAG.getSplatVector(NVT, DL, V.getOperand(0)); + // extract_subvector(insert_subvector(x,y,c1),c2) + // --> extract_subvector(y,c2-c1) + // iff we're just extracting from the inserted subvector. + if (V.getOpcode() == ISD::INSERT_SUBVECTOR) { + SDValue InsSub = V.getOperand(1); + EVT InsSubVT = InsSub.getValueType(); + unsigned NumInsElts = InsSubVT.getVectorMinNumElements(); + unsigned InsIdx = V.getConstantOperandVal(2); + unsigned NumSubElts = NVT.getVectorMinNumElements(); + if (InsIdx <= ExtIdx && (ExtIdx + NumSubElts) <= (InsIdx + NumInsElts) && + TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx) && + InsSubVT.isFixedLengthVector() && NVT.isFixedLengthVector()) + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, InsSub, + DAG.getVectorIdxConstant(ExtIdx - InsIdx, DL)); + } + // Try to move vector bitcast after extract_subv by scaling extraction index: // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') if (V.getOpcode() == ISD::BITCAST && diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 4242d8483e7233..39c7ce1413d1b3 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -314,8 +314,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v ; ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -324,8 +324,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v ; ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -981,7 +981,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -992,7 +992,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -3507,13 +3507,12 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3523,13 +3522,12 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3768,10 +3766,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512F-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 @@ -3784,10 +3782,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 @@ -4147,9 +4145,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0 @@ -4161,9 +4159,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll index fbea08eb1e5502..04d7a9691b645f 100644 --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -26,7 +26,7 @@ define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll index 906fead7f8db53..a212f99680ef4d 100644 --- a/llvm/test/CodeGen/X86/dpbusd_i4.ll +++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll @@ -86,7 +86,7 @@ define i32 @mul_sext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) { ; CHECK-NEXT: vpsraw $12, %ymm0, 
%ymm0 ; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 1436922f9dd114..6d5fc9ed0ab5b6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -1828,22 +1828,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i16_stride3_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -1857,14 +1857,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = 
[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -1885,21 +1885,19 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] -; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1908,22 +1906,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-FCP-LABEL: load_i16_stride3_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = 
[65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -1937,14 +1935,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -1965,21 +1963,19 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1988,22 +1984,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-LABEL: load_i16_stride3_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 +; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512DQ-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -2017,14 +2013,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -2045,21 +2041,19 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -2068,22 +2062,22 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7],ymm0[8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14],ymm2[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm8 ; 
AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] @@ -2097,14 +2091,14 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -2125,21 +2119,19 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7],ymm8[8],ymm1[9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] +; AVX512DQ-FCP-NEXT: vpshufb 
{{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3500,688 +3492,668 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-LABEL: load_i16_stride3_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm21 +; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20 ; AVX512-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm5 -; AVX512-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX512-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX512-NEXT: vmovdqa %xmm2, %xmm14 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm22 -; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm23 -; AVX512-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; 
AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-NEXT: vpshufb %ymm11, %ymm6, %ymm12 +; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm21 +; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22 +; AVX512-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11 ; AVX512-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 -; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 +; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX512-NEXT: vpternlogq $202, %ymm12, 
%ymm24, %ymm10 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm7 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] -; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 -; AVX512-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512-NEXT: vmovdqa64 %xmm8, %xmm25 ; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: 
vmovdqa %ymm13, %ymm2 +; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqa %xmm14, %xmm7 -; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX512-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 -; AVX512-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; 
AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512-NEXT: vextracti32x4 $2, %zmm5, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512-NEXT: vpshufb 
%ymm11, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX512-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride3_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: 
vmovdqa64 224(%rdi), %ymm20 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22 -; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm23 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21 +; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 ; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = 
xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; 
AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm25 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = 
[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512-FCP-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm5, %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX512-FCP-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpblendw 
{{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride3_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), 
%ymm21 +; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm5 -; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX512DQ-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm14 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm22 -; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm23 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm6, %ymm12 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm21 +; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm11 ; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = 
[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512DQ-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 +; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512DQ-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX512DQ-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm7 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = 
[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm25 ; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm2 +; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa %xmm14, %xmm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: 
vinserti64x4 $1, %ymm3, %zmm1, %zmm19 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512DQ-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512DQ-NEXT: vextracti32x4 $2, %zmm5, %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = 
ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; 
AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-NEXT: vextracti32x4 $2, %zmm4, %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm21, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14],ymm3[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm5 -; 
AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm8 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm8[2],xmm2[3,4],xmm8[5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} 
xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm13, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm17, %ymm10 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8],ymm15[9],ymm10[10,11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm22, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm26 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7] -; 
AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm13, %ymm24, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm17, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm13 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] -; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm17, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm10[1],xmm15[2,3],xmm10[4],xmm15[5,6],xmm10[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm5 -; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] 
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm6 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm23, %ymm9 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm9[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7],ymm3[8],ymm9[9,10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm20, %ymm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7],ymm5[8],ymm0[9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 
= ymm12[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 8b6ba51506ab79..8091afbbfd70c3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -1246,29 +1246,28 @@ define void @store_i8_stride7_vf8(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7] -; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] -; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero ; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] ; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax) -; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) -; AVX512BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512BW-FCP-NEXT: vmovq %xmm0, 48(%rax) +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] +; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] +; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm2, 
48(%rax) +; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -1326,29 +1325,28 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero ; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] ; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax) -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 48(%rax) +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm2[1,5,9,13],zero,zero,zero,ymm2[2,6,10,14],zero,zero,zero,ymm2[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,5,0,5,1,3,0] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, 48(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 @@ -2053,77 +2051,76 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512-NEXT: vmovdqa (%r8), %xmm0 -; AVX512-NEXT: vmovdqa (%r10), %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm3 -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm0, %ymm0 -; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[12,13,u,u,u],zero,zero,xmm6[14,15,u,u,u] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,12,13],zero,zero,xmm4[u,u,u,14,15],zero,zero,xmm4[u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15] -; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[1],zero,zero,ymm4[u,u,u,10,2],zero,zero,ymm4[u,u,u,11,3],zero,zero,ymm4[u,u,u,20,28],zero,zero,ymm4[u,u,u,21,29],zero,zero,ymm4[u] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[1,3,3,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] -; AVX512-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,1,3] +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-NEXT: vmovdqa (%r9), %xmm4 +; AVX512-NEXT: vmovdqa (%r10), %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7 +; AVX512-NEXT: vpshufb {{.*#+}} 
ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512-NEXT: vpternlogq $50, %ymm10, %ymm12, %ymm11 +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25] +; AVX512-NEXT: vpternlogq $200, %ymm11, %ymm12, %ymm13 +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 +; AVX512-NEXT: vporq %zmm10, %zmm11, %zmm10 +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512-NEXT: vpternlogq $200, %ymm11, %ymm13, %ymm12 +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] +; AVX512-NEXT: vpandn %ymm12, %ymm13, %ymm12 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512-NEXT: vporq %zmm12, %zmm11, %zmm11 +; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u] +; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = 
xmm2[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 -; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 -; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm4 -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512-NEXT: vpternlogq $50, %ymm6, %ymm8, %ymm7 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u,u],zero -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25] -; AVX512-NEXT: vpternlogq $200, %ymm6, %ymm8, %ymm7 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10],zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512-NEXT: vporq %zmm3, %zmm2, %zmm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; AVX512-NEXT: vpandn %ymm3, %ymm6, %ymm3 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4],zero,ymm0[u,u,u,u,u,5],zero,ymm0[u,u,u,u,u,6],zero,ymm0[u,u,u,u,u],zero,ymm0[23,u,u,u,u,u],zero,ymm0[24,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4,u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u] -; AVX512-NEXT: vpternlogq $200, %ymm3, %ymm6, %ymm7 -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa %xmm5, 96(%rax) -; 
AVX512-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm7 +; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] +; AVX512-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX512-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -2131,70 +2128,69 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 -; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[12,13,u,u,u],zero,zero,xmm4[14,15,u,u,u] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,12,13],zero,zero,xmm2[u,u,u,14,15],zero,zero,xmm2[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] -; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = 
ymm7[3,1,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1],zero,zero,ymm2[u,u,u,10,2],zero,zero,ymm2[u,u,u,11,3],zero,zero,ymm2[u,u,u,20,28],zero,zero,ymm2[u,u,u,21,29],zero,zero,ymm2[u] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,3,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9,u,u,u],zero,zero,ymm4[2,10,u,u,u],zero,zero,ymm4[3,19,u,u,u],zero,zero,ymm4[28,20,u,u,u],zero,zero,ymm4[29,21,u] -; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] -; AVX512-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 -; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm5, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u],zero,zero,ymm6[1,5,u,u,u],zero,zero,ymm6[2,6,u,u,u],zero,zero,ymm6[19,23,u,u,u],zero,zero,ymm6[24,28,u,u,u],zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,5],zero,zero,ymm7[u,u,u,2,6],zero,zero,ymm7[u,u,u,19,23],zero,zero,ymm7[u,u,u,24,28],zero,zero,ymm7[u,u,u,25] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,0,0,0,0] -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 -; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512-FCP-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 
(%rdx), %xmm5 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm4 +; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] +; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero +; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm9[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm11 +; AVX512-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] +; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] +; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21] +; 
AVX512-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 +; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm9 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] +; AVX512-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0 +; AVX512-FCP-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -2202,77 +2198,76 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm3 -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2 -; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[12,13,u,u,u],zero,zero,xmm6[14,15,u,u,u] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,12,13],zero,zero,xmm4[u,u,u,14,15],zero,zero,xmm4[u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm4, %xmm4 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,xmm5[u,u,u,u,13,12],zero,xmm5[u,u,u,u,15,14],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[13,u,u,u,u],zero,zero,xmm1[14,u,u,u,u],zero,zero,xmm1[15] -; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[3,1,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm4[1],zero,zero,ymm4[u,u,u,10,2],zero,zero,ymm4[u,u,u,11,3],zero,zero,ymm4[u,u,u,20,28],zero,zero,ymm4[u,u,u,21,29],zero,zero,ymm4[u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[1,3,3,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] -; AVX512DQ-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,1,3] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm9[u,u,u,u,u,5],zero,ymm9[u,u,u,u,u,6],zero,ymm9[u,u,u,u,u],zero,ymm9[23,u,u,u,u,u],zero,ymm9[24,u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u],zero,ymm11[5,u,u,u,u,u],zero,ymm11[6,u,u,u,u,u,23],zero,ymm11[u,u,u,u,u,24],zero,ymm11[u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq $50, %ymm10, %ymm12, %ymm11 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm8[u,u,u,5],zero,ymm8[u,u,u,u,u,6],zero,ymm8[u,u,u,u,u],zero,ymm8[23,u,u,u,u,u],zero,ymm8[24,u,u,u,u,u],zero +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u],zero,ymm13[5,u,u,u,u,u],zero,ymm13[6,u,u,u,u,u,23],zero,ymm13[u,u,u,u,u,24],zero,ymm13[u,u,u,u,u,25] +; AVX512DQ-NEXT: vpternlogq $200, %ymm11, %ymm12, %ymm13 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,8],zero,zero,ymm11[u,u,u,1,9],zero,zero,ymm11[u,u,u,2,10],zero,zero,ymm11[u,u,u,19,27],zero,zero,ymm11[u,u,u,20,28],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 +; AVX512DQ-NEXT: vporq %zmm10, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[4],zero,ymm7[u,u,u,u,u,5],zero,ymm7[u,u,u,u,u,6],zero,ymm7[u,u,u,u,u],zero,ymm7[23,u,u,u,u,u],zero,ymm7[24,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,ymm12[4,u,u,u,u,u],zero,ymm12[5,u,u,u,u,u],zero,ymm12[6,u,u,u,u,u,23],zero,ymm12[u,u,u,u,u,24],zero,ymm12[u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512DQ-NEXT: vpternlogq $200, %ymm11, %ymm13, %ymm12 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8],zero,ymm11[u,u,u,u,1,9],zero,ymm11[u,u,u,u,18,26],zero,ymm11[u,u,u,u,19,27],zero,ymm11[u,u,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, 
%ymm12, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] +; AVX512DQ-NEXT: vpandn %ymm12, %ymm13, %ymm12 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512DQ-NEXT: vporq %zmm12, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,ymm8[u,u,u,10,2],zero,zero,ymm8[u,u,u,11,3],zero,zero,ymm8[u,u,u,20,28],zero,zero,ymm8[u,u,u,21,29],zero,zero,ymm8[u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[1,9,u,u,u],zero,zero,ymm9[2,10,u,u,u],zero,zero,ymm9[3,19,u,u,u],zero,zero,ymm9[28,20,u,u,u],zero,zero,ymm9[29,21,u] +; AVX512DQ-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm7 -; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm7 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq $50, %ymm6, %ymm8, %ymm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u,u],zero,zero,ymm3[1,9,u,u,u],zero,zero,ymm3[18,26,u,u,u],zero,zero,ymm3[19,27,u,u,u],zero,zero,ymm3[20,28] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[u,u,u,5],zero,ymm2[u,u,u,u,u,6],zero,ymm2[u,u,u,u,u],zero,ymm2[23,u,u,u,u,u],zero,ymm2[24,u,u,u,u,u],zero -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u,u,25] -; AVX512DQ-NEXT: vpternlogq $200, %ymm6, %ymm8, %ymm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,ymm2[u,u,u,1,9],zero,zero,ymm2[u,u,u,2,10],zero,zero,ymm2[u,u,u,19,27],zero,zero,ymm2[u,u,u,20,28],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512DQ-NEXT: vporq %zmm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = 
[255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] -; AVX512DQ-NEXT: vpandn %ymm3, %ymm6, %ymm3 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[4],zero,ymm0[u,u,u,u,u,5],zero,ymm0[u,u,u,u,u,6],zero,ymm0[u,u,u,u,u],zero,ymm0[23,u,u,u,u,u],zero,ymm0[24,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4,u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u] -; AVX512DQ-NEXT: vpternlogq $200, %ymm3, %ymm6, %ymm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa %xmm5, 96(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512DQ-NEXT: vmovdqa %ymm4, 64(%rax) +; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm7 +; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm7 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] +; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2280,70 +2275,69 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[12,13,u,u,u],zero,zero,xmm4[14,15,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,12,13],zero,zero,xmm2[u,u,u,14,15],zero,zero,xmm2[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10],zero,xmm3[u,u,u,u,13,12],zero,xmm3[u,u,u,u,15,14],zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[3,1,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1],zero,zero,ymm2[u,u,u,10,2],zero,zero,ymm2[u,u,u,11,3],zero,zero,ymm2[u,u,u,20,28],zero,zero,ymm2[u,u,u,21,29],zero,zero,ymm2[u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[1,3,3,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm4[1,9,u,u,u],zero,zero,ymm4[2,10,u,u,u],zero,zero,ymm4[3,19,u,u,u],zero,zero,ymm4[28,20,u,u,u],zero,zero,ymm4[29,21,u] -; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] -; AVX512DQ-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8,u,u,u],zero,zero,ymm4[1,9,u,u,u],zero,zero,ymm4[18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,5,2,6,1,5,2,6] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u],zero,zero,ymm6[1,5,u,u,u],zero,zero,ymm6[2,6,u,u,u],zero,zero,ymm6[19,23,u,u,u],zero,zero,ymm6[24,28,u,u,u],zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm7[u,u,u,1,5],zero,zero,ymm7[u,u,u,2,6],zero,zero,ymm7[u,u,u,19,23],zero,zero,ymm7[u,u,u,24,28],zero,zero,ymm7[u,u,u,25] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vporq %zmm4, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,0,1,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,0,8],zero,ymm6[u,u,u,u,1,9],zero,ymm6[u,u,u,u,18,26],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm9 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,ymm10[0,8,u,u,u],zero,zero,ymm10[1,9,u,u,u],zero,zero,ymm10[18,26,u,u,u],zero,zero,ymm10[19,27,u,u,u],zero,zero,ymm10[20,28] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,5,2,6,1,5,2,6] +; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm11, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u],zero,zero,ymm12[1,5,u,u,u],zero,zero,ymm12[2,6,u,u,u],zero,zero,ymm12[19,23,u,u,u],zero,zero,ymm12[24,28,u,u,u],zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,8],zero,zero,ymm12[u,u,u,1,9],zero,zero,ymm12[u,u,u,2,10],zero,zero,ymm12[u,u,u,19,27],zero,zero,ymm12[u,u,u,20,28],zero,zero +; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,1,5],zero,zero,ymm13[u,u,u,2,6],zero,zero,ymm13[u,u,u,19,23],zero,zero,ymm13[u,u,u,24,28],zero,zero,ymm13[u,u,u,25] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vporq %zmm10, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,0,1,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} 
ymm13 = ymm9[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,0,8],zero,ymm13[u,u,u,u,1,9],zero,ymm13[u,u,u,u,18,26],zero,ymm13[u,u,u,u,19,27],zero,ymm13[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,4],zero,ymm11[u,u,u,u,1,5],zero,ymm11[u,u,u,u,2,6],zero,ymm11[u,u,u,u,19,23],zero,ymm11[u,u,u,u,24,28],zero,ymm11[u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,ymm7[u,u,u,10,2],zero,zero,ymm7[u,u,u,11,3],zero,zero,ymm7[u,u,u,20,28],zero,zero,ymm7[u,u,u,21,29],zero,zero,ymm7[u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,3,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,9],zero,ymm9[u,u,u,u,2,10],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u,20,28],zero,ymm9[u,u,u,u,21] +; AVX512DQ-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm9 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[12,13,u,u,u],zero,zero,xmm5[14,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13],zero,zero,xmm0[u,u,u,14,15],zero,zero,xmm0[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[13,u,u,u,u],zero,zero,xmm2[14,u,u,u,u],zero,zero,xmm2[15] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm9, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 96(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 11f422d671541a..99e8cdb179c8dc 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ 
b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -314,8 +314,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v ; ; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -324,8 +324,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v ; ; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -981,7 +981,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -992,7 +992,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4026,10 +4026,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512F-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -4062,10 +4062,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
; ; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] ; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -4541,9 +4541,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 @@ -4559,9 +4559,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 From 71b9f6648222771470473431bc8ef2a2c25e872c Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Tue, 16 Apr 2024 07:34:27 -0400 Subject: [PATCH 083/300] [clang][Index] Use canonical function parameter types in USRs (#68222) This is necessary to ensure that functions declared in different translation units whose parameter types only differ in top-level cv-qualification generate the same USR. For example: ``` // A.cpp void f(const int x); // c:@F@f#1I# // B.cpp void f(int x); // c:@F@f#I# ``` With this patch, the USR for both functions will be `c:@F@f#I#`. --- clang/lib/Index/USRGeneration.cpp | 9 ++++++--- clang/test/Index/USR/func-type.cpp | 12 ++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/clang/lib/Index/USRGeneration.cpp b/clang/lib/Index/USRGeneration.cpp index 5acc86191f8f9c..31c4a3345c09d1 100644 --- a/clang/lib/Index/USRGeneration.cpp +++ b/clang/lib/Index/USRGeneration.cpp @@ -267,10 +267,13 @@ void USRGenerator::VisitFunctionDecl(const FunctionDecl *D) { Out << '>'; } + QualType CanonicalType = D->getType().getCanonicalType(); // Mangle in type information for the arguments. 
-  for (auto *PD : D->parameters()) {
-    Out << '#';
-    VisitType(PD->getType());
+  if (const auto *FPT = CanonicalType->getAs<FunctionProtoType>()) {
+    for (QualType PT : FPT->param_types()) {
+      Out << '#';
+      VisitType(PT);
+    }
   }
   if (D->isVariadic())
     Out << '.';
diff --git a/clang/test/Index/USR/func-type.cpp b/clang/test/Index/USR/func-type.cpp
index ff1cd37a7fc421..459a8cd6da5584 100644
--- a/clang/test/Index/USR/func-type.cpp
+++ b/clang/test/Index/USR/func-type.cpp
@@ -16,3 +16,15 @@ void Func( void (* (*)(int, int))(int, int) );
 // CHECK: {{[0-9]+}}:6 | function/C | Func | c:@F@Func#*F*Fv(#I#I)(#I#I)# |
 void Func( void (* (*)(int, int, int))(int) );
 // CHECK: {{[0-9]+}}:6 | function/C | Func | c:@F@Func#*F*Fv(#I)(#I#I#I)# |
+
+// Functions with parameter types that only differ in top-level cv-qualification should generate the same USR.
+
+void f( const int );
+// CHECK: {{[0-9]+}}:6 | function/C | f | c:@F@f#I# |
+void f( int );
+// CHECK: {{[0-9]+}}:6 | function/C | f | c:@F@f#I# |
+
+void g( int );
+// CHECK: {{[0-9]+}}:6 | function/C | g | c:@F@g#I# |
+void g( const int );
+// CHECK: {{[0-9]+}}:6 | function/C | g | c:@F@g#I# |

From dadcaf82274805456b7d85131cf94f921b5398b7 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Tue, 16 Apr 2024 12:54:01 +0100
Subject: [PATCH 084/300] [mlir][ArmSME] Support decomposing constant splats
 into ArmSME tiles (#88762)

This adds a simple rewrite/legalization to decompose constant splats
larger than a single ArmSME tile into multiple SME virtual tile sized
splats. E.g. a constant splat to `vector<[8]x[8]xi32>` would decompose
into four `vector<[4]x[4]xi32>` splats.

---
 .../ArmSME/Transforms/VectorLegalization.cpp  | 32 ++++++++++++++++++-
 .../Dialect/ArmSME/vector-legalization.mlir   | 11 +++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
index 31500c62c0d600..b595c6dd8a6848 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
@@ -165,6 +165,35 @@ int getNumberOfSMETilesForVectorType(VectorType type) {
   return (vectorRows * vectorCols) / (minNumElts * minNumElts);
 }
 
+/// Legalize `arith.constant dense` splat operations to fit within SME
+/// tiles by decomposing them into tile-sized operations.
+struct LegalizeArithConstantOpsByDecomposition
+    : public OneToNOpConversionPattern<arith::ConstantOp> {
+  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(arith::ConstantOp constantOp, OpAdaptor adaptor,
+                  OneToNPatternRewriter &rewriter) const override {
+    auto vectorType = dyn_cast<VectorType>(constantOp.getType());
+    auto denseAttr = dyn_cast<DenseElementsAttr>(constantOp.getValueAttr());
+    if (!vectorType || !denseAttr || !denseAttr.isSplat())
+      return failure();
+
+    if (!isMultipleOfSMETileVectorType(vectorType))
+      return rewriter.notifyMatchFailure(constantOp,
+                                         kMatchFailureNotSMETileTypeMultiple);
+
+    auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
+    auto tileCount = getNumberOfSMETilesForVectorType(vectorType);
+    auto tileSplat = rewriter.create<arith::ConstantOp>(
+        constantOp.getLoc(), denseAttr.resizeSplat(smeTileType));
+    rewriter.replaceOp(constantOp, SmallVector<Value>(tileCount, tileSplat),
+                       adaptor.getResultMapping());
+
+    return success();
+  }
+};
+
 /// Legalize `vector.outerproduct` operations to fit within SME tiles by
 /// decomposing them into tile-sized operations.
 struct LegalizeVectorOuterProductOpsByDecomposition
@@ -637,7 +666,8 @@ struct VectorLegalizationPass
     // Note: High benefit to ensure masked outer products are lowered first.
     patterns.add<LegalizeMaskedVectorOuterProductOpsByDecomposition>(
        converter, context, 1024);
-    patterns.add<LegalizeVectorOuterProductOpsByDecomposition,
+    patterns.add<LegalizeArithConstantOpsByDecomposition,
+                 LegalizeVectorOuterProductOpsByDecomposition,
                  LegalizeTransferReadOpsByDecomposition,
                  LegalizeTransferWriteOpsByDecomposition>(converter, context);
     populateFuncTypeConversionPatterns(converter, patterns);
diff --git a/mlir/test/Dialect/ArmSME/vector-legalization.mlir b/mlir/test/Dialect/ArmSME/vector-legalization.mlir
index f8be697548c197..f43ef1cce787c5 100644
--- a/mlir/test/Dialect/ArmSME/vector-legalization.mlir
+++ b/mlir/test/Dialect/ArmSME/vector-legalization.mlir
@@ -433,3 +433,14 @@ func.func @lift_illegal_1d_shape_cast_to_memory(%a: index, %b: index, %memref: m
   %cast = vector.shape_cast %illegalRead : vector<[4]x1xf32> to vector<[4]xf32>
   return %cast : vector<[4]xf32>
 }
+
+// -----
+
+// CHECK-LABEL: @multi_tile_splat
+func.func @multi_tile_splat() -> vector<[8]x[8]xi32>
+{
+  // CHECK: %[[SPLAT:.*]] = arith.constant dense<42> : vector<[4]x[4]xi32>
+  // CHECK-NEXT: return %[[SPLAT]], %[[SPLAT]], %[[SPLAT]], %[[SPLAT]] : vector<[4]x[4]xi32>, vector<[4]x[4]xi32>, vector<[4]x[4]xi32>, vector<[4]x[4]xi32>
+  %0 = arith.constant dense<42> : vector<[8]x[8]xi32>
+  return %0 : vector<[8]x[8]xi32>
+}

From 711df7b0ae4a9ea45e431d5c0ff4a0c8b2e732c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?=
Date: Tue, 16 Apr 2024 12:57:59 +0100
Subject: [PATCH 085/300] [vector][mlir] Restrict vector.shuffle to fixed-width
 vectors (#88733)

At the moment there is no support for vector.shuffle for scalable
vectors - various hooks/helpers related to `vector.shuffle` simply
ignore the scalable flags (e.g. `ShuffleOp::inferReturnTypes`). This is
unlikely to change any time soon (vector shuffles are known to be tricky
for scalable vectors), hence this patch restricts `vector.shuffle` to
fixed width vectors.

---
 mlir/include/mlir/Dialect/Vector/IR/VectorOps.td | 4 +++-
 mlir/test/Dialect/Vector/canonicalize.mlir       | 8 --------
 mlir/test/Dialect/Vector/invalid.mlir            | 7 +++++++
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 147bc2354977d7..332b5ad08ced98 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -420,7 +420,7 @@ def Vector_ShuffleOp :
      PredOpTrait<"second operand v2 and result have same element type",
                  TCresVTEtIsSameAsOpBase<0, 1>>,
      InferTypeOpAdaptor]>,
-    Arguments<(ins AnyVectorOfAnyRank:$v1, AnyVectorOfAnyRank:$v2,
+    Arguments<(ins AnyFixedVector:$v1, AnyFixedVector:$v2,
                I64ArrayAttr:$mask)>,
     Results<(outs AnyVector:$vector)> {
   let summary = "shuffle operation";
@@ -444,6 +444,8 @@ def Vector_ShuffleOp :
   mask values must be within range, viz. given two k-D operands v1 and v2
   above, all mask values are in the range [0,s_1+t_1)
 
+  Note, scalable vectors are not supported.
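+  (One likely reason, beyond the hooks simply ignoring the scalable flags,
+  is that the shuffle mask is a static `I64ArrayAttr`, which cannot describe
+  lane positions in a vector whose length is only known at runtime.)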
+ Example: ```mlir diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index 627ac54cf145bf..61a5f2a96e1c1c 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -1943,14 +1943,6 @@ func.func @shuffle_nofold1(%v0 : vector<4xi32>, %v1 : vector<2xi32>) -> vector<5 return %shuffle : vector<5xi32> } -// CHECK-LABEL: func @shuffle_nofold2 -// CHECK: %[[V:.+]] = vector.shuffle %arg0, %arg1 [0, 1, 2, 3] : vector<[4]xi32>, vector<[2]xi32> -// CHECK: return %[[V]] -func.func @shuffle_nofold2(%v0 : vector<[4]xi32>, %v1 : vector<[2]xi32>) -> vector<4xi32> { - %shuffle = vector.shuffle %v0, %v1 [0, 1, 2, 3] : vector<[4]xi32>, vector<[2]xi32> - return %shuffle : vector<4xi32> -} - // ----- // CHECK-LABEL: func @transpose_scalar_broadcast1 diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index c16f1cb2876dbd..c9f7e9c6e2fb0b 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -84,6 +84,13 @@ func.func @shuffle_index_out_of_range(%arg0: vector<2xf32>, %arg1: vector<2xf32> // ----- +func.func @shuffle_scalable_vec(%arg0: vector<[2]xf32>, %arg1: vector<[2]xf32>) { + // expected-error@+1 {{'vector.shuffle' op operand #0 must be fixed-length vector of any type values}} + %1 = vector.shuffle %arg0, %arg1 [0, 1, 2, 3] : vector<[2]xf32>, vector<[2]xf32> +} + +// ----- + func.func @shuffle_empty_mask(%arg0: vector<2xf32>, %arg1: vector<2xf32>) { // expected-error@+1 {{'vector.shuffle' op invalid mask length}} %1 = vector.shuffle %arg0, %arg1 [] : vector<2xf32>, vector<2xf32> From 75244a1043d2be5003dea6914d5edc940c437cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 16 Apr 2024 09:44:54 +0200 Subject: [PATCH 086/300] [clang][Interp] Implement align builtins __builtin_is_aligned __builtin_is_align_up __builtin_is_align_down --- clang/lib/AST/Interp/InterpBuiltin.cpp | 118 +++++++++ clang/test/AST/Interp/builtin-align-cxx.cpp | 258 ++++++++++++++++++++ 2 files changed, 376 insertions(+) create mode 100644 clang/test/AST/Interp/builtin-align-cxx.cpp diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp index 984ba4f7f2689c..f562f9e1cb19fb 100644 --- a/clang/lib/AST/Interp/InterpBuiltin.cpp +++ b/clang/lib/AST/Interp/InterpBuiltin.cpp @@ -977,6 +977,117 @@ static bool interp__builtin_complex(InterpState &S, CodePtr OpPC, return true; } +/// __builtin_is_aligned() +/// __builtin_align_up() +/// __builtin_align_down() +/// The first parameter is either an integer or a pointer. +/// The second parameter is the requested alignment as an integer. 
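+///
+/// As an illustration of the arithmetic used below (assuming a power-of-two
+/// alignment A): align_up(V, A) computes (V + (A - 1)) & ~(A - 1),
+/// align_down(V, A) computes V & ~(A - 1), and is_aligned(V, A) tests
+/// (V & (A - 1)) == 0, so e.g. align_up(22, 16) == 32 and
+/// align_down(22, 16) == 16.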
+static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC,
+                                               const InterpFrame *Frame,
+                                               const Function *Func,
+                                               const CallExpr *Call) {
+  unsigned BuiltinOp = Func->getBuiltinID();
+  unsigned CallSize = callArgSize(S, Call);
+
+  PrimType AlignmentT = *S.Ctx.classify(Call->getArg(1));
+  const APSInt &Alignment = peekToAPSInt(S.Stk, AlignmentT);
+
+  if (Alignment < 0 || !Alignment.isPowerOf2()) {
+    S.FFDiag(Call, diag::note_constexpr_invalid_alignment) << Alignment;
+    return false;
+  }
+  unsigned SrcWidth = S.getCtx().getIntWidth(Call->getArg(0)->getType());
+  APSInt MaxValue(APInt::getOneBitSet(SrcWidth, SrcWidth - 1));
+  if (APSInt::compareValues(Alignment, MaxValue) > 0) {
+    S.FFDiag(Call, diag::note_constexpr_alignment_too_big)
+        << MaxValue << Call->getArg(0)->getType() << Alignment;
+    return false;
+  }
+
+  // The first parameter is either an integer or a pointer (but not a function
+  // pointer).
+  PrimType FirstArgT = *S.Ctx.classify(Call->getArg(0));
+
+  if (isIntegralType(FirstArgT)) {
+    const APSInt &Src = peekToAPSInt(S.Stk, FirstArgT, CallSize);
+    APSInt Align = Alignment.extOrTrunc(Src.getBitWidth());
+    if (BuiltinOp == Builtin::BI__builtin_align_up) {
+      APSInt AlignedVal =
+          APSInt((Src + (Align - 1)) & ~(Align - 1), Src.isUnsigned());
+      pushInteger(S, AlignedVal, Call->getType());
+    } else if (BuiltinOp == Builtin::BI__builtin_align_down) {
+      APSInt AlignedVal = APSInt(Src & ~(Align - 1), Src.isUnsigned());
+      pushInteger(S, AlignedVal, Call->getType());
+    } else {
+      assert(*S.Ctx.classify(Call->getType()) == PT_Bool);
+      S.Stk.push<Boolean>((Src & (Align - 1)) == 0);
+    }
+    return true;
+  }
+
+  assert(FirstArgT == PT_Ptr);
+  const Pointer &Ptr = S.Stk.peek<Pointer>(CallSize);
+
+  unsigned PtrOffset = Ptr.getByteOffset();
+  PtrOffset = Ptr.getIndex();
+  CharUnits BaseAlignment =
+      S.getCtx().getDeclAlign(Ptr.getDeclDesc()->asValueDecl());
+  CharUnits PtrAlign =
+      BaseAlignment.alignmentAtOffset(CharUnits::fromQuantity(PtrOffset));
+
+  if (BuiltinOp == Builtin::BI__builtin_is_aligned) {
+    if (PtrAlign.getQuantity() >= Alignment) {
+      S.Stk.push<Boolean>(true);
+      return true;
+    }
+    // If the alignment is not known to be sufficient, some cases could still
+    // be aligned at run time. However, if the requested alignment is less or
+    // equal to the base alignment and the offset is not aligned, we know that
+    // the run-time value can never be aligned.
+    if (BaseAlignment.getQuantity() >= Alignment &&
+        PtrAlign.getQuantity() < Alignment) {
+      S.Stk.push<Boolean>(false);
+      return true;
+    }
+
+    S.FFDiag(Call->getArg(0), diag::note_constexpr_alignment_compute)
+        << Alignment;
+    return false;
+  }
+
+  assert(BuiltinOp == Builtin::BI__builtin_align_down ||
+         BuiltinOp == Builtin::BI__builtin_align_up);
+
+  // For align_up/align_down, we can return the same value if the alignment
+  // is known to be greater or equal to the requested value.
+  if (PtrAlign.getQuantity() >= Alignment) {
+    S.Stk.push<Pointer>(Ptr);
+    return true;
+  }
+
+  // The alignment could be greater than the minimum at run-time, so we cannot
+  // infer much about the resulting pointer value. One case is possible:
+  // For `_Alignas(32) char buf[N]; __builtin_align_down(&buf[idx], 32)` we
+  // can infer the correct index if the requested alignment is smaller than
+  // the base alignment so we can perform the computation on the offset.
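+  // For instance, align_down(&buf[7], 4) can still be folded to &buf[4]
+  // (offset 7 rounded down to a multiple of 4), as exercised by the
+  // align32array checks in the test below.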
+  if (BaseAlignment.getQuantity() >= Alignment) {
+    assert(Alignment.getBitWidth() <= 64 &&
+           "Cannot handle > 64-bit address-space");
+    uint64_t Alignment64 = Alignment.getZExtValue();
+    CharUnits NewOffset =
+        CharUnits::fromQuantity(BuiltinOp == Builtin::BI__builtin_align_down
+                                    ? llvm::alignDown(PtrOffset, Alignment64)
+                                    : llvm::alignTo(PtrOffset, Alignment64));
+
+    S.Stk.push<Pointer>(Ptr.atIndex(NewOffset.getQuantity()));
+    return true;
+  }
+
+  // Otherwise, we cannot constant-evaluate the result.
+  S.FFDiag(Call->getArg(0), diag::note_constexpr_alignment_adjust) << Alignment;
+  return false;
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
                       const CallExpr *Call) {
   const InterpFrame *Frame = S.Current;
@@ -1291,6 +1402,13 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F,
       return false;
     break;
 
+  case Builtin::BI__builtin_is_aligned:
+  case Builtin::BI__builtin_align_up:
+  case Builtin::BI__builtin_align_down:
+    if (!interp__builtin_is_aligned_up_down(S, OpPC, Frame, F, Call))
+      return false;
+    break;
+
   default:
     S.FFDiag(S.Current->getLocation(OpPC),
              diag::note_invalid_subexpr_in_const_expr)
diff --git a/clang/test/AST/Interp/builtin-align-cxx.cpp b/clang/test/AST/Interp/builtin-align-cxx.cpp
new file mode 100644
index 00000000000000..62d73dba929b2c
--- /dev/null
+++ b/clang/test/AST/Interp/builtin-align-cxx.cpp
@@ -0,0 +1,258 @@
+// C++-specific checks for the alignment builtins
+// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -std=c++11 %s -fsyntax-only -verify=expected,both -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -std=c++11 %s -fsyntax-only -verify=ref,both
+
+
+/// This is just a copy of the one from test/SemaCXX/ with some of the
+/// diagnostic output adapted.
+/// Also, align32array has an initializer now, which means it's not just
+/// a dummy pointer for us and we do actually have type information for it.
+/// In the future, we need to retain type information for dummy pointers as
+/// well, so here is a test that will break once we do that:
+namespace {
+  _Alignas(32) char heh[4];
+  static_assert(!__builtin_is_aligned(&heh[1], 4), ""); // expected-error {{failed}}
+}
+
+
+// Check that we don't crash when using dependent types in __builtin_align:
+template <typename a, a b>
+void *c(void *d) { // both-note{{candidate template ignored}}
+  return __builtin_align_down(d, b);
+}
+
+struct x {};
+x foo;
+void test(void *value) {
+  c<int, 16>(value);
+  c<x, foo>(value); // both-error{{no matching function for call to 'c'}}
+}
+
+template <typename T, long Alignment, long ArraySize = 16>
+void test_templated_arguments() {
+  T array[ArraySize]; // both-error{{variable has incomplete type 'fwddecl'}}
+  static_assert(__is_same(decltype(__builtin_align_up(array, Alignment)), T *), // both-error{{requested alignment is not a power of 2}}
+                "return type should be the decayed array type");
+  static_assert(__is_same(decltype(__builtin_align_down(array, Alignment)), T *),
+                "return type should be the decayed array type");
+  static_assert(__is_same(decltype(__builtin_is_aligned(array, Alignment)), bool),
+                "return type should be bool");
+  T *x1 = __builtin_align_up(array, Alignment);
+  T *x2 = __builtin_align_down(array, Alignment);
+  bool x3 = __builtin_align_up(array, Alignment);
+}
+
+void test() {
+  test_templated_arguments<int, 32, 16>(); // fine
+  test_templated_arguments<fwddecl, 16>();
+  // both-note@-1{{in instantiation of function template specialization 'test_templated_arguments<fwddecl, 16, 16>'}}
+  // both-note@-2{{forward declaration of 'fwddecl'}}
+  test_templated_arguments<int, 7>(); // invalid alignment value
+  // both-note@-1{{in instantiation of function template specialization 'test_templated_arguments<int, 7, 16>'}}
+}
+
+template <typename T, long ArraySize>
+void test_incorrect_alignment_without_instatiation(T value) {
+  int array[32];
+  static_assert(__is_same(decltype(__builtin_align_up(array, 31)), int *), // both-error{{requested alignment is not a power of 2}}
+                "return type should be the decayed array type");
+  static_assert(__is_same(decltype(__builtin_align_down(array, 7)), int *), // both-error{{requested alignment is not a power of 2}}
+                "return type should be the decayed array type");
+  static_assert(__is_same(decltype(__builtin_is_aligned(array, -1)), bool), // both-error{{requested alignment must be 1 or greater}}
+                "return type should be bool");
+  __builtin_align_up(array);        // both-error{{too few arguments to function call, expected 2, have 1}}
+  __builtin_align_up(array, 31);    // both-error{{requested alignment is not a power of 2}}
+  __builtin_align_down(array, 31);  // both-error{{requested alignment is not a power of 2}}
+  __builtin_align_up(array, 31);    // both-error{{requested alignment is not a power of 2}}
+  __builtin_align_up(value, 31);    // This shouldn't warn since the type is dependent
+  __builtin_align_up(value);        // Same here
+
+  __builtin_align_up(array, sizeof(sizeof(value)) - 1); // both-error{{requested alignment is not a power of 2}}
+  __builtin_align_up(array, value); // no diagnostic as the alignment is value dependent.
+  (void)__builtin_align_up(array, ArraySize); // The same above here
+}
+
+// The original fix for the issue above broke some legitimate code.
+// Here is a regression test:
+typedef __SIZE_TYPE__ size_t;
+void *allocate_impl(size_t size);
+template <typename T>
+T *allocate() {
+  constexpr size_t allocation_size =
+      __builtin_align_up(sizeof(T), sizeof(void *));
+  return static_cast<T *>(
+      __builtin_assume_aligned(allocate_impl(allocation_size), sizeof(void *)));
+}
+struct Foo {
+  int value;
+};
+void *test2() {
+  return allocate<Foo>();
+}
+
+// Check that pointers-to-members cannot be used:
+class MemPtr {
+public:
+  int data;
+  void func();
+  virtual void vfunc();
+};
+void test_member_ptr() {
+  __builtin_align_up(&MemPtr::data, 64);    // both-error{{operand of type 'int MemPtr::*' where arithmetic or pointer type is required}}
+  __builtin_align_down(&MemPtr::func, 64);  // both-error{{operand of type 'void (MemPtr::*)()' where arithmetic or pointer type is required}}
+  __builtin_is_aligned(&MemPtr::vfunc, 64); // both-error{{operand of type 'void (MemPtr::*)()' where arithmetic or pointer type is required}}
+}
+
+void test_references(Foo &i) {
+  // Check that the builtins look at the referenced type rather than the reference itself.
+  (void)__builtin_align_up(i, 64);                             // both-error{{operand of type 'Foo' where arithmetic or pointer type is required}}
+  (void)__builtin_align_up(static_cast<Foo &>(i), 64);         // both-error{{operand of type 'Foo' where arithmetic or pointer type is required}}
+  (void)__builtin_align_up(static_cast<const Foo &>(i), 64);   // both-error{{operand of type 'const Foo' where arithmetic or pointer type is required}}
+  (void)__builtin_align_up(static_cast<Foo &&>(i), 64);        // both-error{{operand of type 'Foo' where arithmetic or pointer type is required}}
+  (void)__builtin_align_up(static_cast<const Foo &&>(i), 64);  // both-error{{operand of type 'const Foo' where arithmetic or pointer type is required}}
+  (void)__builtin_align_up(&i, 64);
+}
+
+// Check that constexpr wrapper functions can be constant-evaluated.
+template +constexpr bool wrap_is_aligned(T ptr, long align) { + return __builtin_is_aligned(ptr, align); + // both-note@-1{{requested alignment -3 is not a positive power of two}} + // both-note@-2{{requested alignment 19 is not a positive power of two}} + // both-note@-3{{requested alignment must be 128 or less for type 'char'; 4194304 is invalid}} +} +template +constexpr T wrap_align_up(T ptr, long align) { + return __builtin_align_up(ptr, align); + // both-note@-1{{requested alignment -2 is not a positive power of two}} + // both-note@-2{{requested alignment 18 is not a positive power of two}} + // both-note@-3{{requested alignment must be 2147483648 or less for type 'int'; 8589934592 is invalid}} + // both-error@-4{{operand of type 'bool' where arithmetic or pointer type is required}} +} + +template +constexpr T wrap_align_down(T ptr, long align) { + return __builtin_align_down(ptr, align); + // both-note@-1{{requested alignment -1 is not a positive power of two}} + // both-note@-2{{requested alignment 17 is not a positive power of two}} + // both-note@-3{{requested alignment must be 32768 or less for type 'short'; 1048576 is invalid}} +} + +constexpr int a1 = wrap_align_up(22, 32); +static_assert(a1 == 32, ""); +constexpr int a2 = wrap_align_down(22, 16); +static_assert(a2 == 16, ""); +constexpr bool a3 = wrap_is_aligned(22, 32); +static_assert(!a3, ""); +static_assert(wrap_align_down(wrap_align_up(22, 16), 32) == 32, ""); +static_assert(wrap_is_aligned(wrap_align_down(wrap_align_up(22, 16), 32), 32), ""); +static_assert(!wrap_is_aligned(wrap_align_down(wrap_align_up(22, 16), 32), 64), ""); + +constexpr long const_value(long l) { return l; } +// Check some invalid values during constant-evaluation +static_assert(wrap_align_down(1, const_value(-1)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_align_up(1, const_value(-2)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_is_aligned(1, const_value(-3)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_align_down(1, const_value(17)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_align_up(1, const_value(18)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_is_aligned(1, const_value(19)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} + +// Check invalid values for smaller types: +static_assert(wrap_align_down(static_cast(1), const_value(1 << 20)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to }} +// Check invalid boolean type +static_assert(wrap_align_up(static_cast(1), const_value(1ull << 33)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_is_aligned(static_cast(1), const_value(1 << 22)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} + +// Check invalid boolean type +static_assert(wrap_align_up(static_cast(1), const_value(1 << 21)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in instantiation of function template specialization 'wrap_align_up' requested here}} + +// Check constant evaluation for pointers: +_Alignas(32) char align32array[128] = {}; +static_assert(&align32array[0] == &align32array[0], ""); +// 
+// __builtin_align_up/down can be constant evaluated as a no-op for values
+// that are known to have greater alignment:
+static_assert(__builtin_align_up(&align32array[0], 32) == &align32array[0], "");
+static_assert(__builtin_align_up(&align32array[0], 4) == &align32array[0], "");
+static_assert(__builtin_align_down(&align32array[0], 4) == __builtin_align_up(&align32array[0], 8), "");
+// But it cannot be evaluated if the alignment is greater than the minimum
+// known alignment, since in that case the value might be the same if it happens
+// to actually be aligned to 64 bytes at run time.
+static_assert(&align32array[0] == __builtin_align_up(&align32array[0], 64), ""); // both-error{{not an integral constant expression}}
+// both-note@-1{{cannot constant evaluate the result of adjusting alignment to 64}}
+static_assert(__builtin_align_up(&align32array[0], 64) == __builtin_align_up(&align32array[0], 64), ""); // both-error{{not an integral constant expression}}
+// both-note@-1{{cannot constant evaluate the result of adjusting alignment to 64}}
+
+// However, we can compute the result when the requested alignment is less than
+// the base alignment:
+static_assert(__builtin_align_up(&align32array[0], 4) == &align32array[0], "");
+static_assert(__builtin_align_up(&align32array[1], 4) == &align32array[4], "");
+static_assert(__builtin_align_up(&align32array[2], 4) == &align32array[4], "");
+static_assert(__builtin_align_up(&align32array[3], 4) == &align32array[4], "");
+static_assert(__builtin_align_up(&align32array[4], 4) == &align32array[4], "");
+static_assert(__builtin_align_up(&align32array[5], 4) == &align32array[8], "");
+static_assert(__builtin_align_up(&align32array[6], 4) == &align32array[8], "");
+static_assert(__builtin_align_up(&align32array[7], 4) == &align32array[8], "");
+static_assert(__builtin_align_up(&align32array[8], 4) == &align32array[8], "");
+
+static_assert(__builtin_align_down(&align32array[0], 4) == &align32array[0], "");
+static_assert(__builtin_align_down(&align32array[1], 4) == &align32array[0], "");
+static_assert(__builtin_align_down(&align32array[2], 4) == &align32array[0], "");
+static_assert(__builtin_align_down(&align32array[3], 4) == &align32array[0], "");
+static_assert(__builtin_align_down(&align32array[4], 4) == &align32array[4], "");
+static_assert(__builtin_align_down(&align32array[5], 4) == &align32array[4], "");
+static_assert(__builtin_align_down(&align32array[6], 4) == &align32array[4], "");
+static_assert(__builtin_align_down(&align32array[7], 4) == &align32array[4], "");
+static_assert(__builtin_align_down(&align32array[8], 4) == &align32array[8], "");
+
+// Achieving the same thing using casts to uintptr_t is not allowed:
+static_assert((char *)((__UINTPTR_TYPE__)&align32array[7] & ~3) == &align32array[4], ""); // both-error{{not an integral constant expression}} \
+                                                                                          // expected-note {{cast that performs the conversions of a reinterpret_cast is not allowed in a constant expression}}
+
+static_assert(__builtin_align_down(&align32array[1], 4) == &align32array[0], "");
+static_assert(__builtin_align_down(&align32array[1], 64) == &align32array[0], ""); // both-error{{not an integral constant expression}}
+// both-note@-1{{cannot constant evaluate the result of adjusting alignment to 64}}
+
+// Add some checks for __builtin_is_aligned:
+static_assert(__builtin_is_aligned(&align32array[0], 32), "");
+static_assert(__builtin_is_aligned(&align32array[4], 4), "");
+// We cannot constant evaluate whether the array is aligned to > 32 since this
+// may well be true at run time.
+static_assert(!__builtin_is_aligned(&align32array[0], 64), ""); // both-error{{not an integral constant expression}}
+// both-note@-1{{cannot constant evaluate whether run-time alignment is at least 64}}
+
+// However, if the alignment being checked is less than the minimum alignment of
+// the base object we can check the low bits of the alignment:
+static_assert(__builtin_is_aligned(&align32array[0], 4), "");
+static_assert(!__builtin_is_aligned(&align32array[1], 4), "");
+static_assert(!__builtin_is_aligned(&align32array[2], 4), "");
+static_assert(!__builtin_is_aligned(&align32array[3], 4), "");
+static_assert(__builtin_is_aligned(&align32array[4], 4), "");
+
+// TODO: this should evaluate to true even though we can't evaluate the result
+// of __builtin_align_up() to a concrete value
+static_assert(__builtin_is_aligned(__builtin_align_up(&align32array[0], 64), 64), ""); // both-error{{not an integral constant expression}}
+// both-note@-1{{cannot constant evaluate the result of adjusting alignment to 64}}
+
+// Check that different source and alignment type widths are handled correctly.
+static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), "");
+static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), "");
+// Also check signed -- unsigned mismatch.
+static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), "");
+static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), "");
+static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), "");
+static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), "");
+static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), "");
+static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), "");
+
+// Check the diagnostic message
+_Alignas(void) char align_void_array[1]; // both-error {{invalid application of '_Alignas' to an incomplete type 'void'}}

From 91dd844aa499d69c7ff75bf3156e2e3593a88057 Mon Sep 17 00:00:00 2001
From: Brandon Wu
Date: Tue, 16 Apr 2024 19:59:36 +0800
Subject: [PATCH 087/300] Recommit [RISCV] RISCV vector calling convention
 (2/2) (#79096) (#87736)

Bug fix: Handle RVV return types in the calling convention correctly.
Return values are handled in the same way as function arguments. One
thing to mention is that if a type can be broken down into homogeneous
vector types (e.g. a nested struct whose members are all the same
scalable vector type), it is considered a vector tuple type and needs
to be handled by the tuple type rule.
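To make the tuple type rule concrete, the following is a minimal sketch (an
editor's illustration, not code from this patch) of how a return type could be
classified with the LLVM C++ API; the helper name `numVectorRegGroups` is
hypothetical:

```cpp
// Sketch only: classify a return type under the tuple type rule described
// above. numVectorRegGroups is a hypothetical helper, not part of this patch.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Casting.h"

static unsigned numVectorRegGroups(llvm::Type *RetTy) {
  // A struct whose members are all the same scalable vector type is treated
  // as a vector tuple: it occupies NFIELDS consecutive vector register groups.
  if (auto *STy = llvm::dyn_cast<llvm::StructType>(RetTy))
    if (STy->containsHomogeneousScalableVectorTypes())
      return STy->getNumElements();
  // Otherwise a single scalable vector occupies one register group.
  return 1;
}
```

Under this rule, a pair of identical LMUL=1 scalable vectors would report 2 and
be assigned two consecutive register groups (e.g. v8 and v9), matching the
tuple cases exercised by the calling-conv.ll tests below.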
--- llvm/lib/CodeGen/TargetLoweringBase.cpp | 12 +- .../Target/RISCV/GISel/RISCVCallLowering.cpp | 57 ++-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 244 +++++++++++++++--- llvm/lib/Target/RISCV/RISCVISelLowering.h | 59 ++++- llvm/test/CodeGen/RISCV/rvv/calling-conv.ll | 203 +++++++++++++++ .../RISCV/rvv/vector-deinterleave-load.ll | 6 +- .../CodeGen/RISCV/rvv/vector-deinterleave.ll | 19 +- 7 files changed, 515 insertions(+), 85 deletions(-) diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index f64ded4f2cf965..6e7b67ded23c84 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1809,8 +1809,16 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType, else if (attr.hasRetAttr(Attribute::ZExt)) Flags.setZExt(); - for (unsigned i = 0; i < NumParts; ++i) - Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isfixed=*/true, 0, 0)); + for (unsigned i = 0; i < NumParts; ++i) { + ISD::ArgFlagsTy OutFlags = Flags; + if (NumParts > 1 && i == 0) + OutFlags.setSplit(); + else if (i == NumParts - 1 && i != 0) + OutFlags.setSplitEnd(); + + Outs.push_back( + ISD::OutputArg(OutFlags, PartVT, VT, /*isfixed=*/true, 0, 0)); + } } } diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 45e19cdea300b1..c18892ac62f247 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -34,14 +34,15 @@ struct RISCVOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { // Whether this is assigning args for a return. bool IsRet; - // true if assignArg has been called for a mask argument, false otherwise. - bool AssignedFirstMaskArg = false; + RVVArgDispatcher &RVVDispatcher; public: RISCVOutgoingValueAssigner( - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) + RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet, + RVVArgDispatcher &RVVDispatcher) : CallLowering::OutgoingValueAssigner(nullptr), - RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet) {} + RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet), + RVVDispatcher(RVVDispatcher) {} bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, @@ -51,16 +52,9 @@ struct RISCVOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { const DataLayout &DL = MF.getDataLayout(); const RISCVSubtarget &Subtarget = MF.getSubtarget(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions() && !AssignedFirstMaskArg && - ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) { - FirstMaskArgument = ValNo; - AssignedFirstMaskArg = true; - } - if (RISCVAssignFn(DL, Subtarget.getTargetABI(), ValNo, ValVT, LocVT, LocInfo, Flags, State, Info.IsFixed, IsRet, Info.Ty, - *Subtarget.getTargetLowering(), FirstMaskArgument)) + *Subtarget.getTargetLowering(), RVVDispatcher)) return true; StackSize = State.getStackSize(); @@ -181,14 +175,15 @@ struct RISCVIncomingValueAssigner : public CallLowering::IncomingValueAssigner { // Whether this is assigning args from a return. bool IsRet; - // true if assignArg has been called for a mask argument, false otherwise. 
- bool AssignedFirstMaskArg = false; + RVVArgDispatcher &RVVDispatcher; public: RISCVIncomingValueAssigner( - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) + RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet, + RVVArgDispatcher &RVVDispatcher) : CallLowering::IncomingValueAssigner(nullptr), - RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet) {} + RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet), + RVVDispatcher(RVVDispatcher) {} bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, @@ -201,16 +196,9 @@ struct RISCVIncomingValueAssigner : public CallLowering::IncomingValueAssigner { if (LocVT.isScalableVector()) MF.getInfo()->setIsVectorCall(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions() && !AssignedFirstMaskArg && - ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) { - FirstMaskArgument = ValNo; - AssignedFirstMaskArg = true; - } - if (RISCVAssignFn(DL, Subtarget.getTargetABI(), ValNo, ValVT, LocVT, LocInfo, Flags, State, /*IsFixed=*/true, IsRet, Info.Ty, - *Subtarget.getTargetLowering(), FirstMaskArgument)) + *Subtarget.getTargetLowering(), RVVDispatcher)) return true; StackSize = State.getStackSize(); @@ -420,9 +408,11 @@ bool RISCVCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder, SmallVector SplitRetInfos; splitToValueTypes(OrigRetInfo, SplitRetInfos, DL, CC); + RVVArgDispatcher Dispatcher{&MF, getTLI(), + ArrayRef(F.getReturnType())}; RISCVOutgoingValueAssigner Assigner( CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, - /*IsRet=*/true); + /*IsRet=*/true, Dispatcher); RISCVOutgoingValueHandler Handler(MIRBuilder, MF.getRegInfo(), Ret); return determineAndHandleAssignments(Handler, Assigner, SplitRetInfos, MIRBuilder, CC, F.isVarArg()); @@ -531,6 +521,7 @@ bool RISCVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, CallingConv::ID CC = F.getCallingConv(); SmallVector SplitArgInfos; + SmallVector TypeList; unsigned Index = 0; for (auto &Arg : F.args()) { // Construct the ArgInfo object from destination register and argument type. @@ -542,12 +533,16 @@ bool RISCVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, // correspondingly and appended to SplitArgInfos. splitToValueTypes(AInfo, SplitArgInfos, DL, CC); + TypeList.push_back(Arg.getType()); + ++Index; } + RVVArgDispatcher Dispatcher{&MF, getTLI(), + ArrayRef(TypeList)}; RISCVIncomingValueAssigner Assigner( CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, - /*IsRet=*/false); + /*IsRet=*/false, Dispatcher); RISCVFormalArgHandler Handler(MIRBuilder, MF.getRegInfo()); SmallVector ArgLocs; @@ -585,11 +580,13 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, SmallVector SplitArgInfos; SmallVector Outs; + SmallVector TypeList; for (auto &AInfo : Info.OrigArgs) { // Handle any required unmerging of split value types from a given VReg into // physical registers. ArgInfo objects are constructed correspondingly and // appended to SplitArgInfos. splitToValueTypes(AInfo, SplitArgInfos, DL, CC); + TypeList.push_back(AInfo.Ty); } // TODO: Support tail calls. @@ -607,9 +604,11 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); Call.addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv)); + RVVArgDispatcher ArgDispatcher{&MF, getTLI(), + ArrayRef(TypeList)}; RISCVOutgoingValueAssigner ArgAssigner( CC == CallingConv::Fast ? 
RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, - /*IsRet=*/false); + /*IsRet=*/false, ArgDispatcher); RISCVOutgoingValueHandler ArgHandler(MIRBuilder, MF.getRegInfo(), Call); if (!determineAndHandleAssignments(ArgHandler, ArgAssigner, SplitArgInfos, MIRBuilder, CC, Info.IsVarArg)) @@ -637,9 +636,11 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, SmallVector SplitRetInfos; splitToValueTypes(Info.OrigRet, SplitRetInfos, DL, CC); + RVVArgDispatcher RetDispatcher{&MF, getTLI(), + ArrayRef(F.getReturnType())}; RISCVIncomingValueAssigner RetAssigner( CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, - /*IsRet=*/true); + /*IsRet=*/true, RetDispatcher); RISCVCallReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), Call); if (!determineAndHandleAssignments(RetHandler, RetAssigner, SplitRetInfos, MIRBuilder, CC, Info.IsVarArg)) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 1d1ea6bae6c105..765838aafb58d2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -18223,33 +18224,12 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, return false; } -static unsigned allocateRVVReg(MVT ValVT, unsigned ValNo, - std::optional FirstMaskArgument, - CCState &State, const RISCVTargetLowering &TLI) { - const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT); - if (RC == &RISCV::VRRegClass) { - // Assign the first mask argument to V0. - // This is an interim calling convention and it may be changed in the - // future. - if (FirstMaskArgument && ValNo == *FirstMaskArgument) - return State.AllocateReg(RISCV::V0); - return State.AllocateReg(ArgVRs); - } - if (RC == &RISCV::VRM2RegClass) - return State.AllocateReg(ArgVRM2s); - if (RC == &RISCV::VRM4RegClass) - return State.AllocateReg(ArgVRM4s); - if (RC == &RISCV::VRM8RegClass) - return State.AllocateReg(ArgVRM8s); - llvm_unreachable("Unhandled register class for ValueType"); -} - // Implements the RISC-V calling convention. Returns true upon failure. bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument) { + RVVArgDispatcher &RVVDispatcher) { unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); assert(XLen == 32 || XLen == 64); MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; @@ -18418,7 +18398,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, else if (ValVT == MVT::f64 && !UseGPRForF64) Reg = State.AllocateReg(ArgFPR64s); else if (ValVT.isVector()) { - Reg = allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI); + Reg = RVVDispatcher.getNextPhysReg(); if (!Reg) { // For return values, the vector must be passed fully via registers or // via the stack. 
@@ -18504,9 +18484,15 @@ void RISCVTargetLowering::analyzeInputArgs( unsigned NumArgs = Ins.size(); FunctionType *FType = MF.getFunction().getFunctionType(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Ins); + RVVArgDispatcher Dispatcher; + if (IsRet) { + Dispatcher = RVVArgDispatcher{&MF, this, ArrayRef(Ins)}; + } else { + SmallVector TypeList; + for (const Argument &Arg : MF.getFunction().args()) + TypeList.push_back(Arg.getType()); + Dispatcher = RVVArgDispatcher{&MF, this, ArrayRef(TypeList)}; + } for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Ins[i].VT; @@ -18521,7 +18507,7 @@ void RISCVTargetLowering::analyzeInputArgs( RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy, *this, - FirstMaskArgument)) { + Dispatcher)) { LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(nullptr); @@ -18535,9 +18521,13 @@ void RISCVTargetLowering::analyzeOutputArgs( CallLoweringInfo *CLI, RISCVCCAssignFn Fn) const { unsigned NumArgs = Outs.size(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Outs); + SmallVector TypeList; + if (IsRet) + TypeList.push_back(MF.getFunction().getReturnType()); + else if (CLI) + for (const TargetLowering::ArgListEntry &Arg : CLI->getArgs()) + TypeList.push_back(Arg.Ty); + RVVArgDispatcher Dispatcher{&MF, this, ArrayRef(TypeList)}; for (unsigned i = 0; i != NumArgs; i++) { MVT ArgVT = Outs[i].VT; @@ -18547,7 +18537,7 @@ void RISCVTargetLowering::analyzeOutputArgs( RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy, *this, - FirstMaskArgument)) { + Dispatcher)) { LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << ArgVT << "\n"); llvm_unreachable(nullptr); @@ -18728,7 +18718,7 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument) { + RVVArgDispatcher &RVVDispatcher) { if (LocVT == MVT::i32 || LocVT == MVT::i64) { if (unsigned Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); @@ -18806,13 +18796,14 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, } if (LocVT.isVector()) { - if (unsigned Reg = - allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI)) { + MCPhysReg AllocatedVReg = RVVDispatcher.getNextPhysReg(); + if (AllocatedVReg) { // Fixed-length vectors are located in the corresponding scalable-vector // container types. if (ValVT.isFixedLengthVector()) LocVT = TLI.getContainerForFixedLengthVector(LocVT); - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc( + CCValAssign::getReg(ValNo, ValVT, AllocatedVReg, LocVT, LocInfo)); } else { // Try and pass the address via a "fast" GPR. 
if (unsigned GPRReg = State.AllocateReg(getFastCCArgGPRs(ABI))) { @@ -19440,17 +19431,15 @@ bool RISCVTargetLowering::CanLowerReturn( SmallVector RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Outs); + RVVArgDispatcher Dispatcher{&MF, this, ArrayRef(Outs)}; for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (RISCV::CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full, - ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr, - *this, FirstMaskArgument)) + ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, + nullptr, *this, Dispatcher)) return false; } return true; @@ -21247,6 +21236,181 @@ unsigned RISCVTargetLowering::getMinimumJumpTableEntries() const { return Subtarget.getMinimumJumpTableEntries(); } +// Handle single arg such as return value. +template +void RVVArgDispatcher::constructArgInfos(ArrayRef ArgList) { + // This lambda determines whether an array of types are constructed by + // homogeneous vector types. + auto isHomogeneousScalableVectorType = [](ArrayRef ArgList) { + // First, extract the first element in the argument type. + auto It = ArgList.begin(); + MVT FirstArgRegType = It->VT; + + // Return if there is no return or the type needs split. + if (It == ArgList.end() || It->Flags.isSplit()) + return false; + + ++It; + + // Return if this argument type contains only 1 element, or it's not a + // vector type. + if (It == ArgList.end() || !FirstArgRegType.isScalableVector()) + return false; + + // Second, check if the following elements in this argument type are all the + // same. + for (; It != ArgList.end(); ++It) + if (It->Flags.isSplit() || It->VT != FirstArgRegType) + return false; + + return true; + }; + + if (isHomogeneousScalableVectorType(ArgList)) { + // Handle as tuple type + RVVArgInfos.push_back({(unsigned)ArgList.size(), ArgList[0].VT, false}); + } else { + // Handle as normal vector type + bool FirstVMaskAssigned = false; + for (const auto &OutArg : ArgList) { + MVT RegisterVT = OutArg.VT; + + // Skip non-RVV register type + if (!RegisterVT.isVector()) + continue; + + if (RegisterVT.isFixedLengthVector()) + RegisterVT = TLI->getContainerForFixedLengthVector(RegisterVT); + + if (!FirstVMaskAssigned && RegisterVT.getVectorElementType() == MVT::i1) { + RVVArgInfos.push_back({1, RegisterVT, true}); + FirstVMaskAssigned = true; + continue; + } + + RVVArgInfos.push_back({1, RegisterVT, false}); + } + } +} + +// Handle multiple args. 
+template <> +void RVVArgDispatcher::constructArgInfos(ArrayRef TypeList) { + const DataLayout &DL = MF->getDataLayout(); + const Function &F = MF->getFunction(); + LLVMContext &Context = F.getContext(); + + bool FirstVMaskAssigned = false; + for (Type *Ty : TypeList) { + StructType *STy = dyn_cast(Ty); + if (STy && STy->containsHomogeneousScalableVectorTypes()) { + Type *ElemTy = STy->getTypeAtIndex(0U); + EVT VT = TLI->getValueType(DL, ElemTy); + MVT RegisterVT = + TLI->getRegisterTypeForCallingConv(Context, F.getCallingConv(), VT); + unsigned NumRegs = + TLI->getNumRegistersForCallingConv(Context, F.getCallingConv(), VT); + + RVVArgInfos.push_back( + {NumRegs * STy->getNumElements(), RegisterVT, false}); + } else { + SmallVector ValueVTs; + ComputeValueVTs(*TLI, DL, Ty, ValueVTs); + + for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; + ++Value) { + EVT VT = ValueVTs[Value]; + MVT RegisterVT = + TLI->getRegisterTypeForCallingConv(Context, F.getCallingConv(), VT); + unsigned NumRegs = + TLI->getNumRegistersForCallingConv(Context, F.getCallingConv(), VT); + + // Skip non-RVV register type + if (!RegisterVT.isVector()) + continue; + + if (RegisterVT.isFixedLengthVector()) + RegisterVT = TLI->getContainerForFixedLengthVector(RegisterVT); + + if (!FirstVMaskAssigned && + RegisterVT.getVectorElementType() == MVT::i1) { + RVVArgInfos.push_back({1, RegisterVT, true}); + FirstVMaskAssigned = true; + --NumRegs; + } + + RVVArgInfos.insert(RVVArgInfos.end(), NumRegs, {1, RegisterVT, false}); + } + } + } +} + +void RVVArgDispatcher::allocatePhysReg(unsigned NF, unsigned LMul, + unsigned StartReg) { + assert((StartReg % LMul) == 0 && + "Start register number should be multiple of lmul"); + const MCPhysReg *VRArrays; + switch (LMul) { + default: + report_fatal_error("Invalid lmul"); + case 1: + VRArrays = ArgVRs; + break; + case 2: + VRArrays = ArgVRM2s; + break; + case 4: + VRArrays = ArgVRM4s; + break; + case 8: + VRArrays = ArgVRM8s; + break; + } + + for (unsigned i = 0; i < NF; ++i) + if (StartReg) + AllocatedPhysRegs.push_back(VRArrays[(StartReg - 8) / LMul + i]); + else + AllocatedPhysRegs.push_back(MCPhysReg()); +} + +/// This function determines if each RVV argument is passed by register, if the +/// argument can be assigned to a VR, then give it a specific register. +/// Otherwise, assign the argument to 0 which is a invalid MCPhysReg. +void RVVArgDispatcher::compute() { + uint32_t AssignedMap = 0; + auto allocate = [&](const RVVArgInfo &ArgInfo) { + // Allocate first vector mask argument to V0. 
+ if (ArgInfo.FirstVMask) { + AllocatedPhysRegs.push_back(RISCV::V0); + return; + } + + unsigned RegsNeeded = divideCeil( + ArgInfo.VT.getSizeInBits().getKnownMinValue(), RISCV::RVVBitsPerBlock); + unsigned TotalRegsNeeded = ArgInfo.NF * RegsNeeded; + for (unsigned StartReg = 0; StartReg + TotalRegsNeeded <= NumArgVRs; + StartReg += RegsNeeded) { + uint32_t Map = ((1 << TotalRegsNeeded) - 1) << StartReg; + if ((AssignedMap & Map) == 0) { + allocatePhysReg(ArgInfo.NF, RegsNeeded, StartReg + 8); + AssignedMap |= Map; + return; + } + } + + allocatePhysReg(ArgInfo.NF, RegsNeeded, 0); + }; + + for (unsigned i = 0; i < RVVArgInfos.size(); ++i) + allocate(RVVArgInfos[i]); +} + +MCPhysReg RVVArgDispatcher::getNextPhysReg() { + assert(CurIdx < AllocatedPhysRegs.size() && "Index out of range"); + return AllocatedPhysRegs[CurIdx++]; +} + namespace llvm::RISCVVIntrinsicsTable { #define GET_RISCVVIntrinsicsTable_IMPL diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index e2633733c31b19..b10da3d40befb7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -24,6 +24,7 @@ namespace llvm { class InstructionCost; class RISCVSubtarget; struct RISCVRegisterInfo; +class RVVArgDispatcher; namespace RISCVISD { // clang-format off @@ -875,7 +876,7 @@ class RISCVTargetLowering : public TargetLowering { ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument); + RVVArgDispatcher &RVVDispatcher); private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, @@ -1017,19 +1018,71 @@ class RISCVTargetLowering : public TargetLowering { unsigned getMinimumJumpTableEntries() const override; }; +/// As per the spec, the rules for passing vector arguments are as follows: +/// +/// 1. For the first vector mask argument, use v0 to pass it. +/// 2. For vector data arguments or rest vector mask arguments, starting from +/// the v8 register, if a vector register group between v8-v23 that has not been +/// allocated can be found and the first register number is a multiple of LMUL, +/// then allocate this vector register group to the argument and mark these +/// registers as allocated. Otherwise, pass it by reference and are replaced in +/// the argument list with the address. +/// 3. For tuple vector data arguments, starting from the v8 register, if +/// NFIELDS consecutive vector register groups between v8-v23 that have not been +/// allocated can be found and the first register number is a multiple of LMUL, +/// then allocate these vector register groups to the argument and mark these +/// registers as allocated. Otherwise, pass it by reference and are replaced in +/// the argument list with the address. 
+class RVVArgDispatcher { +public: + static constexpr unsigned NumArgVRs = 16; + + struct RVVArgInfo { + unsigned NF; + MVT VT; + bool FirstVMask = false; + }; + + template + RVVArgDispatcher(const MachineFunction *MF, const RISCVTargetLowering *TLI, + ArrayRef ArgList) + : MF(MF), TLI(TLI) { + constructArgInfos(ArgList); + compute(); + } + + RVVArgDispatcher() = default; + + MCPhysReg getNextPhysReg(); + +private: + SmallVector RVVArgInfos; + SmallVector AllocatedPhysRegs; + + const MachineFunction *MF = nullptr; + const RISCVTargetLowering *TLI = nullptr; + + unsigned CurIdx = 0; + + template void constructArgInfos(ArrayRef Ret); + void compute(); + void allocatePhysReg(unsigned NF = 1, unsigned LMul = 1, + unsigned StartReg = 0); +}; + namespace RISCV { bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument); + RVVArgDispatcher &RVVDispatcher); bool CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument); + RVVArgDispatcher &RVVDispatcher); bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll index 78e8700a9feff8..647d3158b6167f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll @@ -162,3 +162,206 @@ define void @caller_tuple_argument({, } %x) } declare void @callee_tuple_argument({, }) + +; %0 -> v8 +; %1 -> v9 +define @case1( %0, %1) { +; CHECK-LABEL: case1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %a = add %0, %1 + ret %a +} + +; %0 -> v8 +; %1 -> v10-v11 +; %2 -> v9 +define @case2_1( %0, %1, %2) { +; CHECK-LABEL: case2_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %a = add %0, %2 + ret %a +} +define @case2_2( %0, %1, %2) { +; CHECK-LABEL: case2_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vadd.vv v8, v10, v10 +; CHECK-NEXT: ret + %a = add %1, %1 + ret %a +} + +; %0 -> v8 +; %1 -> {v10-v11, v12-v13} +; %2 -> v9 +define @case3_1( %0, {, } %1, %2) { +; CHECK-LABEL: case3_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %add = add %0, %2 + ret %add +} +define @case3_2( %0, {, } %1, %2) { +; CHECK-LABEL: case3_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vadd.vv v8, v10, v12 +; CHECK-NEXT: ret + %a = extractvalue { , } %1, 0 + %b = extractvalue { , } %1, 1 + %add = add %a, %b + ret %add +} + +; %0 -> v8 +; %1 -> {by-ref, by-ref} +; %2 -> v9 +define @case4_1( %0, {, } %1, %2) { +; CHECK-LABEL: case4_1: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vl8re64.v v8, (a1) +; CHECK-NEXT: vl8re64.v v16, (a0) +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vadd.vv v8, v16, v8 +; CHECK-NEXT: ret + %a = extractvalue { , } %1, 0 + %b = extractvalue { , } 
%1, 1 + %add = add %a, %b + ret %add +} +define @case4_2( %0, {, } %1, %2) { +; CHECK-LABEL: case4_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %add = add %0, %2 + ret %add +} + +declare @callee1() +declare void @callee2() +declare void @callee3() +define void @caller() { +; RV32-LABEL: caller: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: call callee1 +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v8, v8 +; RV32-NEXT: call callee2 +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: caller: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: call callee1 +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: call callee2 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = call @callee1() + %add = add %a, %a + call void @callee2( %add) + ret void +} + +declare {, } @callee_tuple() +define void @caller_tuple() { +; RV32-LABEL: caller_tuple: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: call callee_tuple +; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: call callee3 +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: caller_tuple: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: call callee_tuple +; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: call callee3 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = call {, } @callee_tuple() + %b = extractvalue {, } %a, 0 + %c = extractvalue {, } %a, 1 + %add = add %b, %c + call void @callee3( %add) + ret void +} + +declare {, {, }} @callee_nested() +define void @caller_nested() { +; RV32-LABEL: caller_nested: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: call callee_nested +; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: call callee3 +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: caller_nested: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: call callee_nested +; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: call callee3 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = call {, {, }} @callee_nested() + %b = extractvalue {, {, }} %a, 0 + %c = extractvalue {, {, }} %a, 1 + %c0 = extractvalue {, } %c, 0 + %c1 = 
extractvalue {, } %c, 1 + %add0 = add %b, %c0 + %add1 = add %add0, %c1 + call void @callee3( %add1) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index a320aecc6fce49..6a712080fda74a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -18,10 +18,10 @@ define {, } @vector_deinterleave_load_nxv16i ; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 -; CHECK-NEXT: vnsrl.wi v8, v12, 0 -; CHECK-NEXT: vmsne.vi v0, v8, 0 -; CHECK-NEXT: vnsrl.wi v10, v12, 8 +; CHECK-NEXT: vnsrl.wi v10, v12, 0 ; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v10, v12, 8 +; CHECK-NEXT: vmsne.vi v9, v10, 0 ; CHECK-NEXT: ret %vec = load , ptr %p %retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i1( %vec) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index ef4baf34d23f03..d98597fabcd953 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -8,18 +8,18 @@ define {, } @vector_deinterleave_nxv16i1_nxv ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v12, v8, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v0, a0 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma -; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 -; CHECK-NEXT: vnsrl.wi v12, v8, 0 -; CHECK-NEXT: vmsne.vi v0, v12, 0 -; CHECK-NEXT: vnsrl.wi v12, v8, 8 -; CHECK-NEXT: vmsne.vi v8, v12, 0 +; CHECK-NEXT: vmerge.vim v14, v8, 1, v0 +; CHECK-NEXT: vnsrl.wi v10, v12, 0 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v10, v12, 8 +; CHECK-NEXT: vmsne.vi v9, v10, 0 ; CHECK-NEXT: ret %retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i1( %vec) ret {, } %retval @@ -102,12 +102,13 @@ define {, } @vector_deinterleave_nxv64i1_nxv ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v28, v8, 0 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; CHECK-NEXT: vmsne.vi v0, v24, 0 +; CHECK-NEXT: vmsne.vi v7, v24, 0 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v24, v16, 8 ; CHECK-NEXT: vnsrl.wi v28, v8, 8 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma -; CHECK-NEXT: vmsne.vi v8, v24, 0 +; CHECK-NEXT: vmsne.vi v9, v24, 0 +; CHECK-NEXT: vmv1r.v v8, v7 ; CHECK-NEXT: ret %retval = call {, } @llvm.experimental.vector.deinterleave2.nxv128i1( %vec) ret {, } %retval From ac1f2de7b581c26a768c4d2a2aad36505cc63c31 Mon Sep 17 00:00:00 2001 From: Frederik Harwath Date: Tue, 16 Apr 2024 14:09:25 +0200 Subject: [PATCH 088/300] [MLIR][docs] Mention declarePromisedInterface in Interfaces doc (#88689) Co-authored-by: Frederik Harwath Co-authored-by: Mehdi Amini --- mlir/docs/Interfaces.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/mlir/docs/Interfaces.md b/mlir/docs/Interfaces.md index 536e7613e50936..51747db546bb76 100644 --- a/mlir/docs/Interfaces.md +++ b/mlir/docs/Interfaces.md @@ -299,6 +299,30 @@ owner of the dialect containing the object nor the owner of the interface are aware of an interface implementation, which can lead to 
duplicate or diverging implementations.
 
+Forgetting to register an external model can lead to bugs which are hard to
+track down. The `declarePromisedInterface` function can be used to declare that
+an external model implementation for an operation must eventually be provided.
+
+```
+  void MyDialect::initialize() {
+    declarePromisedInterface<SomeInterface, SomeOp>();
+    ...
+  }
+```
+
+Now attempting to use the interface, e.g. in a cast, without a prior
+registration of the external model will lead to a runtime error that will look
+similar to this:
+
+```
+LLVM ERROR: checking for an interface (`SomeInterface`) that was promised by dialect 'mydialect' but never implemented. This is generally an indication that the dialect extension implementing the interface was never registered.
+```
+
+If you encounter this error for a dialect and an interface provided by MLIR,
+you may look for a method that will be named like
+`registerExternalModels(DialectRegistry &registry)`; try
+to find it with `git grep 'register.*SomeInterface.*Model' mlir`.
+
 #### Dialect Fallback for OpInterface
 
 Some dialects have an open ecosystem and don't register all of the possible

From 1c076b43c294c7d29d99dd50f6853b33a5b99789 Mon Sep 17 00:00:00 2001
From: Spenser Bauman
Date: Tue, 16 Apr 2024 08:14:22 -0400
Subject: [PATCH 089/300] [mlir][tosa] Implement dynamic shape support for
 tosa.max_pool2d lowering (#87538)

The existing lowering for tosa.max_pool2d only supports dynamic
dimensions when the dynamic dimension is the batch dimension. This
change updates the lowering to support arbitrary dynamic dimensions on
the inputs and outputs of the tosa.max_pool2d operation.

This change also fixes a bug in the implementation of implicit
broadcasting in the tosa-to-linalg pass, which was introducing uses of
constant ops that violated dominance requirements.
---
 .../mlir/Dialect/Tosa/IR/TosaTypesBase.td     |  10 +-
 .../Conversion/TosaToLinalg/TosaToLinalg.cpp  |  12 +-
 .../TosaToLinalg/TosaToLinalgNamed.cpp        |  88 ++++++++----
 .../TosaToLinalg/tosa-to-linalg-named.mlir    |  54 +++++++
 .../TosaToLinalg/tosa-to-linalg.mlir          |  12 +-
 .../Tosa/CPU/test-maxpool-dynamic.mlir        | 112 ++++++++++++++++++
 6 files changed, 251 insertions(+), 37 deletions(-)
 create mode 100644 mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir

diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
index cff3de0a69af95..3687891fe4b7cf 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -130,11 +130,11 @@ def Tosa_ScalarTensor : TensorRankOf<[Tosa_AnyNumber], [0]>;
 // to not include any remaining unranked tensors.
def Tosa_UnrankedTensor : UnrankedTensorOf<[Tosa_AnyNumber]>; -def Tosa_Tensor1D : AnyTypeOf<[Tosa_UnrankedTensor, 1DTensorOf<[Tosa_AnyNumber]>]>; -def Tosa_Tensor2D : AnyTypeOf<[Tosa_UnrankedTensor, 2DTensorOf<[Tosa_AnyNumber]>]>; -def Tosa_Tensor3D : AnyTypeOf<[Tosa_UnrankedTensor, 3DTensorOf<[Tosa_AnyNumber]>]>; -def Tosa_Tensor4D : AnyTypeOf<[Tosa_UnrankedTensor, 4DTensorOf<[Tosa_AnyNumber]>]>; -def Tosa_Tensor5D : AnyTypeOf<[Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [5]>]>; +def Tosa_Tensor1D : AnyTypeOf<[Tosa_UnrankedTensor, 1DTensorOf<[Tosa_AnyNumber]>], "1-d tensor", "::mlir::TensorType">; +def Tosa_Tensor2D : AnyTypeOf<[Tosa_UnrankedTensor, 2DTensorOf<[Tosa_AnyNumber]>], "2-d tensor", "::mlir::TensorType">; +def Tosa_Tensor3D : AnyTypeOf<[Tosa_UnrankedTensor, 3DTensorOf<[Tosa_AnyNumber]>], "3-d tensor", "::mlir::TensorType">; +def Tosa_Tensor4D : AnyTypeOf<[Tosa_UnrankedTensor, 4DTensorOf<[Tosa_AnyNumber]>], "4-d tensor", "::mlir::TensorType">; +def Tosa_Tensor5D : AnyTypeOf<[Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [5]>], "5-d tensor", "::mlir::TensorType">; // Ranked tensors up to given rank. def Tosa_Tensor1Dto4D : AnyTypeOf<[ diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 7c477f2e1412be..d8dd1c93722b09 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -766,11 +766,15 @@ static Value broadcastDynamicDimension(PatternRewriter &rewriter, Location loc, // Emit 'then' region of 'scf.if' auto emitThenRegion = [&](OpBuilder &opBuilder, Location loc) { + // It is not safe to cache constants across regions. + // New constants could potentially violate dominance requirements. + IndexPool localPool; + // Emit 'tensor.empty' op SmallVector outputTensorShape; for (auto index : llvm::seq(0, rank)) { auto size = index == dim ? targetSize - : getOrFoldTensorDim(rewriter, loc, indexPool, + : getOrFoldTensorDim(rewriter, loc, localPool, operand, index); outputTensorShape.push_back(size); } @@ -812,9 +816,9 @@ static Value broadcastDynamicDimensions(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, Value operand, ArrayRef targetShape, ArrayRef masterOperands) { - size_t rank = operand.getType().cast().getRank(); - assert(targetShape.size() == rank); - assert(masterOperands.size() == rank); + int64_t rank = operand.getType().cast().getRank(); + assert((int64_t)targetShape.size() == rank); + assert((int64_t)masterOperands.size() == rank); for (auto index : llvm::seq(0, rank)) operand = broadcastDynamicDimension(rewriter, loc, indexPool, operand, index, diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp index 3f39cbf03a9a80..8fb8d16486560c 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp @@ -26,6 +26,8 @@ #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" + #include #include @@ -34,7 +36,7 @@ using namespace mlir::tosa; static mlir::Value applyPad(Location loc, Value input, ArrayRef pad, TypedAttr padAttr, OpBuilder &rewriter) { - // Input should be padded if necessary. + // Input should be padded only if necessary. 
if (llvm::all_of(pad, [](int64_t p) { return p == 0; })) return input; @@ -47,7 +49,7 @@ static mlir::Value applyPad(Location loc, Value input, ArrayRef pad, SmallVector paddedShape; SmallVector lowIndices; SmallVector highIndices; - for (int i = 0, s = inputShape.size(); i < s; i++) { + for (size_t i : llvm::seq(inputShape.size())) { auto lowPad = pad[i * 2]; auto highPad = pad[i * 2 + 1]; if (ShapedType::isDynamic(inputShape[i])) @@ -131,20 +133,19 @@ static mlir::Value linalgBroadcastAndMaybeExtSI(PatternRewriter &rewriter, static mlir::Value reifyConstantDim(int64_t attr, ImplicitLocOpBuilder &builder) { - return builder.createOrFold( - builder.getIndexType(), - builder.create(builder.getI64IntegerAttr(attr))); + return builder.create(attr); } // Calculating the output width/height using the formula: // H = ((IH+pad_top+pad_bottom-(dilation_y*(KH-1)+1))/stride_y)+1 // W = ((IW+pad_left+pad_right-(dilation_x*(KW-1)+1))/stride_x)+1 -static mlir::Value getConvOutputDim(Location loc, Value inputDim, - int64_t padBeforeAttr, int64_t padAfterAttr, - Value kernelDim, int64_t strideAttr, - int64_t dilationAttr, Type inputETy, - OpBuilder &rewriter) { +static mlir::Value getConvOrPoolOutputDim(Location loc, Value inputDim, + int64_t padBeforeAttr, + int64_t padAfterAttr, Value kernelDim, + int64_t strideAttr, + int64_t dilationAttr, + OpBuilder &rewriter) { ImplicitLocOpBuilder builder(loc, rewriter); auto one = rewriter.create( loc, IntegerAttr::get(inputDim.getType(), 1)); @@ -171,7 +172,6 @@ static SmallVector inferDynamicDimsForConv( ArrayRef dilationAttr, ArrayRef inputSizeDims, ArrayRef kernelSizeDims, OpBuilder &rewriter) { ShapedType inputTy = cast(input.getType()); - Type inputETy = inputTy.getElementType(); int64_t inputRank = inputTy.getRank(); SmallVector dynDims; @@ -190,8 +190,8 @@ static SmallVector inferDynamicDimsForConv( rewriter.create(loc, weight, kernelDim); // H = F(IH, pad_top, pad_bottom, dilation_y, KH, stride_y) dynDims[inputDim] = - getConvOutputDim(loc, initDynDim, padTop, padBottom, kernelDynDim, - stride, dilation, inputETy, rewriter); + getConvOrPoolOutputDim(loc, initDynDim, padTop, padBottom, + kernelDynDim, stride, dilation, rewriter); } } @@ -685,20 +685,61 @@ class MaxPool2dConverter : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; + // Compute the dynamic output sizes of the maxpool operation. 
+ static SmallVector + computeDynamicOutputSizes(tosa::MaxPool2dOp op, PatternRewriter &rewriter) { + TensorType resultTy = op.getType(); + Location loc = op.getLoc(); + + TypedValue input = op.getInput(); + ArrayRef kernel = op.getKernel(); + ArrayRef pad = op.getPad(); + ArrayRef stride = op.getStride(); + + SmallVector dynamicDims; + + // Batch dimension + if (resultTy.isDynamicDim(0)) + dynamicDims.push_back(rewriter.create(loc, input, 0)); + + // Height/width dimensions + for (int64_t dim : {1, 2}) { + if (!resultTy.isDynamicDim(dim)) + continue; + + // Index into the attribute arrays + int64_t index = dim - 1; + + // Input height/width + Value ihw = rewriter.create(loc, input, dim); + + // Kernel height/width + Value khw = rewriter.create(loc, kernel[index]); + + // Output height/width + Value ohw = getConvOrPoolOutputDim(loc, ihw, pad[index * 2], + pad[index * 2 + 1], khw, stride[index], + /*dilationAttr=*/1, rewriter); + dynamicDims.push_back(ohw); + } + + // Channel dimension + if (resultTy.isDynamicDim(3)) + dynamicDims.push_back(rewriter.create(loc, input, 3)); + + return dynamicDims; + } + LogicalResult matchAndRewrite(tosa::MaxPool2dOp op, PatternRewriter &rewriter) const final { Location loc = op.getLoc(); - Value input = op.getInput(); - ShapedType inputTy = cast(input.getType()); + TypedValue input = op.getInput(); + ShapedType inputTy = input.getType(); - ShapedType resultTy = cast(op.getType()); + ShapedType resultTy = op.getType(); Type resultETy = inputTy.getElementType(); - auto dynamicDimsOr = - checkHasDynamicBatchDims(rewriter, op, {input, op.getOutput()}); - if (!dynamicDimsOr.has_value()) - return failure(); - SmallVector dynamicDims = *dynamicDimsOr; + SmallVector dynamicDims = computeDynamicOutputSizes(op, rewriter); // Determine what the initial value needs to be for the max pool op. 
TypedAttr initialAttr; @@ -721,6 +762,7 @@ class MaxPool2dConverter : public OpRewritePattern { pad.resize(2, 0); llvm::append_range(pad, op.getPad()); pad.resize(pad.size() + 2, 0); + Value paddedInput = applyPad(loc, input, pad, initialAttr, rewriter); Value initialValue = rewriter.create(loc, initialAttr); @@ -736,9 +778,7 @@ class MaxPool2dConverter : public OpRewritePattern { loc, resultTy.getShape(), resultTy.getElementType(), dynamicDims); Value filledEmptyTensor = - rewriter - .create(loc, ValueRange{initialValue}, - ValueRange{emptyTensor}) + rewriter.create(loc, initialValue, emptyTensor) .result(); Value fakeWindowDims = diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir index e64903671e599f..b4049000c50dc8 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir @@ -1,5 +1,6 @@ // RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named))" %s -verify-diagnostics -o -| FileCheck %s // RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named{prefer-conv2d-kernel-layout-hwcf=true}))" %s -verify-diagnostics -o -| FileCheck --check-prefix="HWCF" %s +// RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,cse))" %s -verify-diagnostics -o -| FileCheck --check-prefix="CHECK-CSE" %s // CHECK-LABEL: @matmul func.func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) { @@ -215,6 +216,59 @@ func.func @max_pool_i32(%arg0: tensor<1x6x34x62xi32>) -> () { return } +// CHECK-CSE-LABEL: @max_pool_all_dynamic +func.func @max_pool_all_dynamic(%arg0: tensor) -> tensor { + // Batch size + // CHECK-CSE: %[[C0:.+]] = arith.constant 0 : index + // CHECK-CSE: %[[BATCH:.+]] = tensor.dim %arg0, %[[C0]] : tensor + + // Compute output height + // CHECK-CSE: %[[C1:.+]] = arith.constant 1 : index + // CHECK-CSE: %[[IH:.+]] = tensor.dim %arg0, %[[C1]] : tensor + // CHECK-CSE: %[[C2:.+]] = arith.constant 2 : index + // CHECK-CSE: %[[PADDED_BEFORE:.+]] = arith.addi %[[IH]], %[[C0]] : index + // CHECK-CSE: %[[PADDED_AFTER:.+]] = arith.addi %[[PADDED_BEFORE]], %[[C0]] : index + // CHECK-CSE: %[[SUB_ONE:.+]] = arith.subi %[[C2]], %[[C1]] : index + // CHECK-CSE: %[[DILATED:.+]] = arith.muli %[[C1]], %[[SUB_ONE]] : index + // CHECK-CSE: %[[ADD_ONE:.+]] = arith.addi %[[DILATED]], %[[C1]] : index + // CHECK-CSE: %[[SUBTRACT:.+]] = arith.subi %[[PADDED_AFTER]], %[[ADD_ONE]] : index + // CHECK-CSE: %[[DIVIDE:.+]] = arith.divui %[[SUBTRACT]], %[[C1]] : index + // CHECK-CSE: %[[HEIGHT:.+]] = arith.addi %[[DIVIDE]], %[[C1]] : index + + // Compute output width + // CHECK-CSE: %[[IW:.+]] = tensor.dim %arg0, %[[C2]] : tensor + // CHECK-CSE: %[[C5:.+]] = arith.constant 5 : index + // CHECK-CSE: %[[PADDED_BEFORE:.+]] = arith.addi %[[IW]], %[[C2]] : index + // CHECK-CSE: %[[PADDED_AFTER:.+]] = arith.addi %[[PADDED_BEFORE]], %[[C2]] : index + // CHECK-CSE: %[[SUB_ONE:.+]] = arith.subi %[[C5]], %[[C1]] : index + // CHECK-CSE: %[[DILATED:.+]] = arith.muli %[[C1]], %[[SUB_ONE]] : index + // CHECK-CSE: %[[ADD_ONE:.+]] = arith.addi %[[DILATED]], %[[C1]] : index + // CHECK-CSE: %[[SUBTRACT:.+]] = arith.subi %[[PADDED_AFTER]], %[[ADD_ONE]] : index + // CHECK-CSE: %[[DIVIDE:.+]] = arith.divui %[[SUBTRACT]], %[[C1]] : index + // CHECK-CSE: %[[WIDTH:.+]] = arith.addi %14, %[[C1]] : index + + // Channel size + // CHECK-CSE: 
%[[C3:.+]] = arith.constant 3 : index + // CHECK-CSE: %[[CHANNEL:.+]] = tensor.dim %arg0, %[[C3]] : tensor + + // Pad the input + // CHECK-CSE: %[[FLOAT_MIN:.+]] = arith.constant -3.40282347E+38 : f32 + // CHECK-CSE: %[[PADDED:.+]] = tensor.pad %arg0 low[0, 0, 2, 0] high[0, 0, 2, 0] { + // CHECK-CSE: tensor.yield %[[FLOAT_MIN]] : f32 + + // Allocate the output and fill with minimum value + // CHECK-CSE: %[[INIT:.+]] = tensor.empty(%[[BATCH]], %[[HEIGHT]], %[[WIDTH]], %[[CHANNEL]]) : tensor + // CHECK-CSE: %[[FILL:.+]] = linalg.fill ins(%[[FLOAT_MIN]] : f32) outs(%[[INIT]] : tensor) -> tensor + // CHECK-CSE: %[[FAKE_WINDOW:.+]] = tensor.empty() : tensor<2x5xf32> + + // Compute max pool + // CHECK-CSE: %[[OUT:.+]] = linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%[[PADDED]], %[[FAKE_WINDOW]] : tensor, tensor<2x5xf32>) outs(%[[FILL]] : tensor) -> tensor + // CHECK-CSE: return %[[OUT]] + + %0 = tosa.max_pool2d %arg0 {kernel = array, pad = array, stride = array} : (tensor) -> tensor + return %0 : tensor +} + // ----- // CHECK-LABEL: @avg_pool_f32 diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index 1fa783f05f04ee..445e8be47678d5 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -270,7 +270,8 @@ func.func @test_add_2d_all_dynamic(%arg0: tensor, %arg1: tensor // CHECK: %[[VAL_1:.*]] = arith.cmpi eq, %[[VAL_0]], %[[CONST1]] : index // CHECK: %[[ARG0_DIM0_BROADCAST:.*]] = scf.if %[[VAL_1]] -> (tensor) { - // CHECK: %[[VAL_2:.*]] = tensor.dim %[[ARG0]], %[[CONST1]] : tensor + // CHECK: %[[LOCAL_CONST1:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_2:.*]] = tensor.dim %[[ARG0]], %[[LOCAL_CONST1]] : tensor // CHECK: %[[VAL_3:.*]] = tensor.empty(%[[MAX_DIM0]], %[[VAL_2]]) : tensor // CHECK: %[[VAL_4:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[ARG0]] : tensor) outs(%[[VAL_3]] : tensor) { // CHECK: ^bb0(%[[VAL_5:.*]]: f32, %[[VAL_6:.*]]: f32): @@ -284,7 +285,8 @@ func.func @test_add_2d_all_dynamic(%arg0: tensor, %arg1: tensor // CHECK: %[[VAL_8:.*]] = arith.cmpi eq, %[[VAL_7]], %[[CONST1]] : index // CHECK: %[[ARG0_DIM1_BROADCAST:.*]] = scf.if %[[VAL_8]] -> (tensor) { - // CHECK: %[[VAL_9:.*]] = tensor.dim %[[ARG0_DIM0_BROADCAST]], %[[CONST0]] : tensor + // CHECK: %[[LOCAL_CONST0:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_9:.*]] = tensor.dim %[[ARG0_DIM0_BROADCAST]], %[[LOCAL_CONST0]] : tensor // CHECK: %[[VAL_10:.*]] = tensor.empty(%[[VAL_9]], %[[MAX_DIM1]]) : tensor // CHECK: %[[VAL_11:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[ARG0_DIM0_BROADCAST]] : tensor) outs(%[[VAL_10]] : tensor) { // CHECK: ^bb0(%[[VAL_12:.*]]: f32, %[[VAL_13:.*]]: f32): @@ -298,7 +300,8 @@ func.func @test_add_2d_all_dynamic(%arg0: tensor, %arg1: tensor // CHECK: %[[VAL_15:.*]] = arith.cmpi eq, %[[VAL_14]], %[[CONST1]] : index // CHECK: %[[ARG1_DIM0_BROADCAST:.*]] = scf.if %[[VAL_15]] -> (tensor) { - // CHECK: %[[VAL_16:.*]] = tensor.dim %[[ARG1]], %[[CONST1]] : tensor + // CHECK: %[[LOCAL_CONST1:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_16:.*]] = tensor.dim %[[ARG1]], %[[LOCAL_CONST1]] : tensor // CHECK: %[[VAL_17:.*]] = tensor.empty(%[[MAX_DIM0]], %[[VAL_16]]) : tensor // CHECK: %[[VAL_18:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], 
iterator_types = ["parallel", "parallel"]} ins(%[[ARG1]] : tensor) outs(%[[VAL_17]] : tensor) { // CHECK: ^bb0(%[[VAL_19:.*]]: f32, %[[VAL_20:.*]]: f32): @@ -312,7 +315,8 @@ func.func @test_add_2d_all_dynamic(%arg0: tensor, %arg1: tensor // CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_21]], %[[CONST1]] : index // CHECK: %[[ARG1_DIM1_BROADCAST:.*]] = scf.if %[[VAL_22]] -> (tensor) { - // CHECK: %[[VAL_23:.*]] = tensor.dim %[[ARG1_DIM0_BROADCAST]], %[[CONST0]] : tensor + // CHECK: %[[LOCAL_CONST0:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_23:.*]] = tensor.dim %[[ARG1_DIM0_BROADCAST]], %[[LOCAL_CONST0]] : tensor // CHECK: %[[VAL_24:.*]] = tensor.empty(%[[VAL_23]], %[[MAX_DIM1]]) : tensor // CHECK: %[[VAL_25:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[ARG1_DIM0_BROADCAST]] : tensor) outs(%[[VAL_24]] : tensor) { // CHECK: ^bb0(%[[VAL_26:.*]]: f32, %[[VAL_27:.*]]: f32): diff --git a/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir b/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir new file mode 100644 index 00000000000000..05a78e32b9e115 --- /dev/null +++ b/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir @@ -0,0 +1,112 @@ +// DEFINE: %{tosa-to-linalg-pipeline} = -pass-pipeline="builtin.module(func.func(tosa-infer-shapes,tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith))" + +// RUN: mlir-opt %s \ +// RUN: %{tosa-to-linalg-pipeline} \ +// RUN: | mlir-opt \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -buffer-deallocation-pipeline \ +// RUN: -test-lower-to-llvm \ +// RUN: | mlir-cpu-runner \ +// RUN: -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \ +// RUN: | FileCheck %s + +// Validate that the TOSA lowering for tosa.max_pool2d produces the same results when +// for fully static and fully dynamic inputs. + +!tensor_type = tensor<1x4x4x1xf32> +!memref_type = memref<1x4x4x1xf32> + +// Utility functions +func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } + +func.func @max_pool_static(%arg0: !tensor_type) -> (!tensor_type) { + %0 = tosa.max_pool2d %arg0 { + pad = array, + kernel = array, + stride = array + } : (tensor<1x4x4x1xf32>) -> tensor<1x4x4x1xf32> + return %0 : tensor<1x4x4x1xf32> +} + +func.func @max_pool_dynamic(%arg0: tensor) -> (tensor) { + %0 = tosa.max_pool2d %arg0 { + pad = array, + kernel = array, + stride = array + } : (tensor) -> tensor + return %0 : tensor +} + +// Test harness to compare the results of a fully statically shaped max_pool2d with +// a fully dynamically shaped max_pool2d on the same inputs. 
+func.func @main() {
+  %A = arith.constant dense<[[
+    [[0.0], [0.1], [0.2], [0.3]], // H = 0
+    [[1.0], [1.1], [1.2], [1.3]], // H = 1
+    [[2.0], [2.1], [2.2], [2.3]], // H = 2
+    [[3.0], [3.1], [3.2], [3.3]]  // H = 3
+  ]]> : tensor<1x4x4x1xf32>
+
+  %A_dynamic = tensor.cast %A : !tensor_type to tensor<?x?x?x?xf32>
+
+  // Call both static and dynamically sized variants
+  %result_static = func.call @max_pool_static(%A) : (!tensor_type) -> !tensor_type
+  %result_dynamic = func.call @max_pool_dynamic(%A_dynamic) : (tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+
+  %static_buffer = bufferization.to_memref %result_static : !memref_type
+  %unranked_static_buffer = memref.cast %static_buffer : !memref_type to memref<*xf32>
+
+  // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 4, 4, 1] strides = [16, 4, 1, 1] data =
+
+  // CHECK-NEXT: 1.1
+  // CHECK-NEXT: 1.2
+  // CHECK-NEXT: 1.3
+  // CHECK-NEXT: 1.3
+
+  // CHECK-NEXT: 2.1
+  // CHECK-NEXT: 2.2
+  // CHECK-NEXT: 2.3
+  // CHECK-NEXT: 2.3
+
+  // CHECK-NEXT: 3.1
+  // CHECK-NEXT: 3.2
+  // CHECK-NEXT: 3.3
+  // CHECK-NEXT: 3.3
+
+  // CHECK-NEXT: 3.1
+  // CHECK-NEXT: 3.2
+  // CHECK-NEXT: 3.3
+  // CHECK-NEXT: 3.3
+
+  func.call @printMemrefF32(%unranked_static_buffer) : (memref<*xf32>) -> ()
+
+  %dynamic_buffer = bufferization.to_memref %result_dynamic : memref<?x?x?x?xf32>
+  %unranked_dynamic_buffer = memref.cast %dynamic_buffer : memref<?x?x?x?xf32> to memref<*xf32>
+
+  // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 4, 4, 1] strides = [16, 4, 1, 1] data =
+  // CHECK-NEXT: 1.1
+  // CHECK-NEXT: 1.2
+  // CHECK-NEXT: 1.3
+  // CHECK-NEXT: 1.3
+
+  // CHECK-NEXT: 2.1
+  // CHECK-NEXT: 2.2
+  // CHECK-NEXT: 2.3
+  // CHECK-NEXT: 2.3
+
+  // CHECK-NEXT: 3.1
+  // CHECK-NEXT: 3.2
+  // CHECK-NEXT: 3.3
+  // CHECK-NEXT: 3.3
+
+  // CHECK-NEXT: 3.1
+  // CHECK-NEXT: 3.2
+  // CHECK-NEXT: 3.3
+  // CHECK-NEXT: 3.3
+
+  func.call @printMemrefF32(%unranked_dynamic_buffer) : (memref<*xf32>) -> ()
+
+  return
+}

From d4602a96b48b00a50c4d891673fc622ca2e37f0a Mon Sep 17 00:00:00 2001
From: Nico Weber
Date: Tue, 16 Apr 2024 08:14:41 -0400
Subject: [PATCH 090/300] [gn] port fe48bf672e1ab2

---
 llvm/utils/gn/secondary/lldb/test/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/lldb/test/BUILD.gn b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
index 414ea4933c519d..c8245739842d9e 100644
--- a/llvm/utils/gn/secondary/lldb/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
@@ -118,6 +118,7 @@ write_lit_cfg("lit_shell_site_cfg") {
     "LLDB_TOOLS_DIR=" + rebase_path("$root_out_dir/bin"),
     "LLDB_USE_SYSTEM_DEBUGSERVER=1",  # XXX port //lldb/tools/debugserver (?)
     "LLVM_HOST_TRIPLE=$llvm_current_triple",
+    "LLVM_USE_SANITIZER=",
   ]

 if (llvm_enable_zlib) {

From c309dc6d0759b23b570c563f611530ff1a49e1bd Mon Sep 17 00:00:00 2001
From: mahtohappy
Date: Tue, 16 Apr 2024 17:48:45 +0530
Subject: [PATCH 091/300] [Clang][Sema] placement new initializes typedef
 array with correct size (#83124)

When in-place new-ing a local variable of an array of trivial type, the
generated code now calls 'memset' with the correct size of the array;
earlier it was generating the squared size (the typedef'd array size
applied twice).

The cause: given `typedef TYPE TArray[8]; TArray x;`, the type of the
declarator is TArray[8]. In `SemaExprCXX.cpp::BuildCXXNew` we check
whether it is a typedef of constant array size and, if so, recover the
original element type, so this works fine for non-dependent cases.
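For reference, a minimal sketch of the affected pattern (it mirrors the test
added below; per the description above, the instantiation f<char>() used to
memset 64 bytes instead of 8):

  #include <new>

  template <typename TYPE>
  void f() {
    typedef TYPE TArray[8]; // typedef of a constant-size array type
    TArray x;
    new (&x) TArray();      // placement new on the typedef'd array
  }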
But in the template case we go through `TreeTransform.h:TransformCXXNewExpr`,
and there we again check the allocated type, which is still TArray[8] and
stays that way, so the allocation size is computed from the un-desugared
typedef'd array type and we get the squared-size allocation. ArraySize gets
calculated earlier in `TreeTransform.h`, so the `if (!ArraySize)` condition
guarding the typedef handling was failing.

Fix: change that condition to `if (ArraySize)`.

Fixes #41441
---
 clang/docs/ReleaseNotes.rst                   |  2 ++
 clang/lib/Sema/TreeTransform.h                | 14 ++++++++++++-
 .../instantiate-new-placement-size.cpp        | 20 +++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/SemaCXX/instantiate-new-placement-size.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 76701dc723b6c3..255d2cc0440438 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -539,6 +539,8 @@ Bug Fixes to C++ Support
   Fixes (#GH70604), (#GH79754), (#GH84163), (#GH84425), (#GH86054), (#GH86398), and (#GH86399).
 - Fix a crash when deducing ``auto`` from an invalid dereference (#GH88329).
 - Fix a crash in requires expression with templated base class member function. Fixes (#GH84020).
+- Placement new initializes typedef array with correct size
+  (`#GH41441 <https://github.com/llvm/llvm-project/issues/41441>`_).

 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 8c96134af7c8f0..9d15f3eacbb0f4 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -12802,6 +12802,19 @@ TreeTransform<Derived>::TransformCXXNewExpr(CXXNewExpr *E) {
     ArraySize = NewArraySize.get();
   }

+  // Per C++0x [expr.new]p5, the type being constructed may be a
+  // typedef of an array type.
+  QualType AllocType = AllocTypeInfo->getType();
+  if (ArraySize) {
+    if (const ConstantArrayType *Array =
+            SemaRef.Context.getAsConstantArrayType(AllocType)) {
+      ArraySize = IntegerLiteral::Create(SemaRef.Context, Array->getSize(),
+                                         SemaRef.Context.getSizeType(),
+                                         E->getBeginLoc());
+      AllocType = Array->getElementType();
+    }
+  }
+
   // Transform the placement arguments (if any).
   bool ArgumentChanged = false;
   SmallVector<Expr *, 8> PlacementArgs;
@@ -12863,7 +12876,6 @@ TreeTransform<Derived>::TransformCXXNewExpr(CXXNewExpr *E) {
       return E;
   }

-  QualType AllocType = AllocTypeInfo->getType();
   if (!ArraySize) {
     // If no array size was specified, but the new expression was
     // instantiated with an array type (e.g., "new T" where T is
diff --git a/clang/test/SemaCXX/instantiate-new-placement-size.cpp b/clang/test/SemaCXX/instantiate-new-placement-size.cpp
new file mode 100644
index 00000000000000..7a29d3dee8491e
--- /dev/null
+++ b/clang/test/SemaCXX/instantiate-new-placement-size.cpp
@@ -0,0 +1,20 @@
+// RUN: %clang -S -fno-discard-value-names -emit-llvm -o - %s | FileCheck %s
+// Issue no: 41441
+#include <new>
+
+// CHECK: call void @llvm.memset.p0.i64(ptr align 1 %x, i8 0, i64 8, i1 false)
+// CHECK: call void @llvm.memset.p0.i64(ptr align 16 %x, i8 0, i64 32, i1 false)
+template <typename TYPE>
+void f()
+{
+  typedef TYPE TArray[8];
+
+  TArray x;
+  new(&x) TArray();
+}
+
+int main()
+{
+  f<char>();
+  f<float>();
+}

From 34013e7ce25868aa8ddea116f79184e8603af56c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 16 Apr 2024 13:03:09 +0100
Subject: [PATCH 092/300] [X86] Add shuffle tests for
 BLEND(PERMUTE(X),PERMUTE(Y)) patterns

Some very basic tests for a case where we could fold
BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y)).

These assume the permute masks are the same and "complete" (no
undefs/duplicate elements), but we could relax that depending on the
blend mask.
---
 .../X86/vector-shuffle-combining-avx.ll       | 31 +++++++++++++
 .../X86/vector-shuffle-combining-avx512f.ll   | 44 +++++++++++++++++++
 .../X86/vector-shuffle-combining-sse41.ll     | 15 +++++++
 3 files changed, 90 insertions(+)

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 0c76c14afb0aee..4859a8e0eaaa51 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -305,6 +305,37 @@ define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
   ret <4 x float> %2
 }

+define <8 x i32> @combine_blend_of_permutes_v8i32(<4 x i64> %a0, <4 x i64> %a1) {
+; AVX1-LABEL: combine_blend_of_permutes_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6],ymm1[7]
+; AVX1-NEXT:    ret{{[l|q]}}
+;
+; AVX2-LABEL: combine_blend_of_permutes_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6],ymm1[7]
+; AVX2-NEXT:    ret{{[l|q]}}
+;
+; AVX512-LABEL: combine_blend_of_permutes_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [4,21,6,23,16,1,2,19]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    ret{{[l|q]}}
+  %s0 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %s1 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %x0 = bitcast <4 x i64> %s0 to <8 x i32>
+  %x1 = bitcast <4 x i64> %s1 to <8 x i32>
+  %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 12, i32 5, i32 6, i32 15>
+  ret <8 x i32> %r
+}
+
 define <2 x double> @constant_fold_vpermilvar_pd() {
 ; CHECK-LABEL: constant_fold_vpermilvar_pd:
 ; CHECK:       #
%bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
index f53b1eeaf8f54b..e87e810971e119 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -973,3 +973,47 @@ define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
   %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> zeroinitializer)
   ret <8 x i64> %2
 }
+
+define <16 x i32> @blend_of_permutes_v16i32(<8 x i64> %a0, <8x i64> %a1) {
+; X86-AVX512F-LABEL: blend_of_permutes_v16i32:
+; X86-AVX512F:       # %bb.0:
+; X86-AVX512F-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
+; X86-AVX512F-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
+; X86-AVX512F-NEXT:    movw $-25958, %ax # imm = 0x9A9A
+; X86-AVX512F-NEXT:    kmovw %eax, %k1
+; X86-AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; X86-AVX512F-NEXT:    retl
+;
+; X86-AVX512BW-LABEL: blend_of_permutes_v16i32:
+; X86-AVX512BW:       # %bb.0:
+; X86-AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
+; X86-AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
+; X86-AVX512BW-NEXT:    movw $-25958, %ax # imm = 0x9A9A
+; X86-AVX512BW-NEXT:    kmovd %eax, %k1
+; X86-AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512F-LABEL: blend_of_permutes_v16i32:
+; X64-AVX512F:       # %bb.0:
+; X64-AVX512F-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
+; X64-AVX512F-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
+; X64-AVX512F-NEXT:    movw $-25958, %ax # imm = 0x9A9A
+; X64-AVX512F-NEXT:    kmovw %eax, %k1
+; X64-AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; X64-AVX512F-NEXT:    retq
+;
+; X64-AVX512BW-LABEL: blend_of_permutes_v16i32:
+; X64-AVX512BW:       # %bb.0:
+; X64-AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
+; X64-AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
+; X64-AVX512BW-NEXT:    movw $-25958, %ax # imm = 0x9A9A
+; X64-AVX512BW-NEXT:    kmovd %eax, %k1
+; X64-AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; X64-AVX512BW-NEXT:    retq
+  %s0 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+  %s1 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+  %x0 = bitcast <8 x i64> %s0 to <16 x i32>
+  %x1 = bitcast <8 x i64> %s1 to <16 x i32>
+  %r = shufflevector <16 x i32> %x0, <16 x i32> %x1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>
+  ret <16 x i32> %r
+}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index 5eb017bc80ca58..33851f56fe8de5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -22,6 +22,21 @@ define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
   ret <16 x i8> %res0
 }

+define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE-LABEL: combine_blend_of_permutes_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; SSE-NEXT:    retq
+  %s0 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+  %s1 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+  %x0 = bitcast <2 x i64> %s0 to <4 x i32>
+  %x1 = bitcast <2 x i64> %s1 to <4 x i32>
+  %r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %r
+}
+
 define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
 ; SSE-LABEL: PR50049:
 ; SSE:       # %bb.0:

From
b73476c7843f21966acb2fb5cab8515d9ec02905 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Tue, 16 Apr 2024 13:29:34 +0100
Subject: [PATCH 093/300] [SLP] Make sure MinVF is a power-of-2 by using
 PowerOf2Ceil.

This should ensure we explore the same VFs as before 6d66db3890a18e39.

Fixes https://github.com/llvm/llvm-project/issues/88640.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  4 +--
 .../trunc-store-value-ty-not-power-of-2.ll    | 33 +++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c63b500f546f3b..d0bcdceae392bd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15155,8 +15155,8 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
       Type *ValueTy = StoreTy;
       if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
         ValueTy = Trunc->getSrcTy();
-      unsigned MinVF = TTI->getStoreMinimumVF(
-          R.getMinVF(DL->getTypeSizeInBits(StoreTy)), StoreTy, ValueTy);
+      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
+          R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));

       if (MaxVF < MinVF) {
         LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/trunc-store-value-ty-not-power-of-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/trunc-store-value-ty-not-power-of-2.ll
index 81b4ee40e7fdf3..2f0fad70b593b5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/trunc-store-value-ty-not-power-of-2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/trunc-store-value-ty-not-power-of-2.ll
@@ -107,3 +107,36 @@ define void @test_4_trunc_i24_to_i16(i24 %x, ptr %A) {
   store i16 %t, ptr %gep.3, align 1
   ret void
 }
+
+%struct.d = type { [3 x i8], [3 x i8], [2 x i8] }
+
+; Test case for https://github.com/llvm/llvm-project/issues/88640.
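+; (Illustrative note, not from the original test: the stored type here is i24,
+; so the minimum VF computed for the store chain need not be a power of two;
+; PowerOf2Ceil rounds it up, e.g. PowerOf2Ceil(3) == 4, so SLP only explores
+; power-of-2 VFs. The exact MinVF value is target-dependent.)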
+define void @test_access_i24_directly(ptr %src, ptr noalias %dst) "target-cpu"="btver2" {
+; CHECK-LABEL: define void @test_access_i24_directly(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[SRC]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i24
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds [[STRUCT_D:%.*]], ptr [[SRC]], i64 0, i32 1
+; CHECK-NEXT:    [[BF_LOAD:%.*]] = load i24, ptr [[GEP_SRC]], align 1
+; CHECK-NEXT:    [[BF_VALUE:%.*]] = and i24 [[TMP1]], 8388607
+; CHECK-NEXT:    [[BF_CLEAR:%.*]] = and i24 [[BF_LOAD]], -8388608
+; CHECK-NEXT:    [[BF_SET:%.*]] = or disjoint i24 [[BF_CLEAR]], [[BF_VALUE]]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds [[STRUCT_D]], ptr [[DST]], i64 0, i32 1
+; CHECK-NEXT:    store i24 [[BF_SET]], ptr [[GEP_DST]], align 1
+; CHECK-NEXT:    store i24 0, ptr [[DST]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i64, ptr %src, align 8
+  %1 = trunc i64 %0 to i24
+  %gep.src = getelementptr inbounds %struct.d, ptr %src, i64 0, i32 1
+  %bf.load = load i24, ptr %gep.src, align 1
+  %bf.value = and i24 %1, 8388607
+  %bf.clear = and i24 %bf.load, -8388608
+  %bf.set = or disjoint i24 %bf.clear, %bf.value
+  %gep.dst = getelementptr inbounds %struct.d, ptr %dst, i64 0, i32 1
+  store i24 %bf.set, ptr %gep.dst, align 1
+  store i24 0, ptr %dst, align 8
+  ret void
+}

From e272c37934a06cd80b9b072afc09afae5fd8c218 Mon Sep 17 00:00:00 2001
From: Nico Weber
Date: Tue, 16 Apr 2024 08:31:35 -0400
Subject: [PATCH 094/300] clang: Try to get windows-seh-async-verify.cpp to
 pass on mac

On macOS, file paths start with /Users/..., which clang-cl interprets as
the /U switch followed by a preprocessor macro name to undefine. Put the
filename after `--` to prevent this.

For consistency, move %s to the end of the regular `clang` lines (where
this isn't needed) as well.
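For illustration (hypothetical path; /U<macro> is clang-cl's MSVC-style
"undefine macro" option, and `--` ends option parsing):

  clang-cl ... /Users/me/windows-seh-async-verify.cpp     (misread as /U sers/me/...)
  clang-cl ... -- /Users/me/windows-seh-async-verify.cpp  (path treated as an input file)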
--- clang/test/Driver/windows-seh-async-verify.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/test/Driver/windows-seh-async-verify.cpp b/clang/test/Driver/windows-seh-async-verify.cpp index 5fda6a77dba049..ace93cf44a31d2 100644 --- a/clang/test/Driver/windows-seh-async-verify.cpp +++ b/clang/test/Driver/windows-seh-async-verify.cpp @@ -1,7 +1,7 @@ -// RUN: %clang --target=x86_64-pc-windows -fasync-exceptions -fsyntax-only %s -### 2>&1 | FileCheck %s -// RUN: %clang_cl --target=x86_64-pc-windows /EHa -fsyntax-only %s -### 2>&1 | FileCheck %s -// RUN: %clang --target=x86_64-pc-windows-gnu -fasync-exceptions -fsyntax-only %s -### 2>&1 | FileCheck %s --check-prefixes=GNU-ALL,GNU -// RUN: %clang_cl --target=x86_64-pc-windows-gnu /EHa -fsyntax-only %s -### 2>&1 | FileCheck %s --check-prefixes=GNU-ALL,CL-GNU +// RUN: %clang --target=x86_64-pc-windows -fasync-exceptions -fsyntax-only -### %s 2>&1 | FileCheck %s +// RUN: %clang_cl --target=x86_64-pc-windows /EHa -fsyntax-only -### -- %s 2>&1 | FileCheck %s +// RUN: %clang --target=x86_64-pc-windows-gnu -fasync-exceptions -fsyntax-only -### %s 2>&1 | FileCheck %s --check-prefixes=GNU-ALL,GNU +// RUN: %clang_cl --target=x86_64-pc-windows-gnu /EHa -fsyntax-only -### -- %s 2>&1 | FileCheck %s --check-prefixes=GNU-ALL,CL-GNU // CHECK-NOT: warning // GNU: warning: argument unused during compilation: '-fasync-exceptions' [-Wunused-command-line-argument] From f69ded0d9965a6b2b76ce12db876c70f249d96d1 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Tue, 16 Apr 2024 16:36:53 +0400 Subject: [PATCH 095/300] [clang] Introduce `SemaOpenMP` (#88642) This patch moves OpenMP-related entities out of `Sema` to a newly created `SemaOpenMP` class. This is a part of the effort to split `Sema` up, and follows the recent example of CUDA, OpenACC, SYCL, HLSL. Additional context can be found in https://github.com/llvm/llvm-project/pull/82217, https://github.com/llvm/llvm-project/pull/84184, https://github.com/llvm/llvm-project/pull/87634. --- clang/include/clang/Parse/Parser.h | 15 +- clang/include/clang/Sema/Sema.h | 1419 +----- clang/include/clang/Sema/SemaOpenMP.h | 1447 ++++++ clang/lib/Parse/ParseDecl.cpp | 3 +- clang/lib/Parse/ParseExpr.cpp | 5 +- clang/lib/Parse/ParseOpenMP.cpp | 240 +- clang/lib/Parse/ParseStmt.cpp | 3 +- clang/lib/Sema/Sema.cpp | 25 +- clang/lib/Sema/SemaDecl.cpp | 32 +- clang/lib/Sema/SemaDeclCXX.cpp | 9 +- clang/lib/Sema/SemaExpr.cpp | 610 +-- clang/lib/Sema/SemaExprMember.cpp | 9 +- clang/lib/Sema/SemaLambda.cpp | 3 +- clang/lib/Sema/SemaOpenMP.cpp | 4333 ++++++++++------- clang/lib/Sema/SemaStmt.cpp | 6 +- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 60 +- clang/lib/Sema/SemaType.cpp | 3 +- clang/lib/Sema/TreeTransform.h | 684 +-- 18 files changed, 4565 insertions(+), 4341 deletions(-) create mode 100644 clang/include/clang/Sema/SemaOpenMP.h diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 5950dd74cfe83c..23b268126de4e0 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -18,6 +18,7 @@ #include "clang/Lex/CodeCompletionHandler.h" #include "clang/Lex/Preprocessor.h" #include "clang/Sema/Sema.h" +#include "clang/Sema/SemaOpenMP.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Frontend/OpenMP/OMPContext.h" #include "llvm/Support/SaveAndRestore.h" @@ -2537,7 +2538,7 @@ class Parser : public CodeCompletionHandler { /// Returns true for declaration, false for expression. 
bool isForInitDeclaration() { if (getLangOpts().OpenMP) - Actions.startOpenMPLoop(); + Actions.OpenMP().startOpenMPLoop(); if (getLangOpts().CPlusPlus) return Tok.is(tok::kw_using) || isCXXSimpleDeclaration(/*AllowForRangeDecl=*/true); @@ -3396,7 +3397,7 @@ class Parser : public CodeCompletionHandler { SourceLocation Loc); /// Parse clauses for '#pragma omp [begin] declare target'. - void ParseOMPDeclareTargetClauses(Sema::DeclareTargetContextInfo &DTCI); + void ParseOMPDeclareTargetClauses(SemaOpenMP::DeclareTargetContextInfo &DTCI); /// Parse '#pragma omp end declare target'. void ParseOMPEndDeclareTargetDirective(OpenMPDirectiveKind BeginDKind, @@ -3486,7 +3487,7 @@ class Parser : public CodeCompletionHandler { /// Parses indirect clause /// \param ParseOnly true to skip the clause's semantic actions and return // false; - bool ParseOpenMPIndirectClause(Sema::DeclareTargetContextInfo &DTCI, + bool ParseOpenMPIndirectClause(SemaOpenMP::DeclareTargetContextInfo &DTCI, bool ParseOnly); /// Parses clause with a single expression and an additional argument /// of a kind \a Kind. @@ -3556,12 +3557,12 @@ class Parser : public CodeCompletionHandler { /// Parses a reserved locator like 'omp_all_memory'. bool ParseOpenMPReservedLocator(OpenMPClauseKind Kind, - Sema::OpenMPVarListDataTy &Data, + SemaOpenMP::OpenMPVarListDataTy &Data, const LangOptions &LangOpts); /// Parses clauses with list. bool ParseOpenMPVarList(OpenMPDirectiveKind DKind, OpenMPClauseKind Kind, SmallVectorImpl &Vars, - Sema::OpenMPVarListDataTy &Data); + SemaOpenMP::OpenMPVarListDataTy &Data); bool ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors, bool EnteringContext, bool AllowDestructorName, bool AllowConstructorName, @@ -3569,11 +3570,11 @@ class Parser : public CodeCompletionHandler { SourceLocation *TemplateKWLoc, UnqualifiedId &Result); /// Parses the mapper modifier in map, to, and from clauses. - bool parseMapperModifier(Sema::OpenMPVarListDataTy &Data); + bool parseMapperModifier(SemaOpenMP::OpenMPVarListDataTy &Data); /// Parses map-type-modifiers in map clause. /// map([ [map-type-modifier[,] [map-type-modifier[,] ...] map-type : ] list) /// where, map-type-modifier ::= always | close | mapper(mapper-identifier) - bool parseMapTypeModifiers(Sema::OpenMPVarListDataTy &Data); + bool parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data); //===--------------------------------------------------------------------===// // OpenACC Parsing. 
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index d93ac7863b721d..a5fe83a539aaf8 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -26,14 +26,12 @@ #include "clang/AST/ExprCXX.h" #include "clang/AST/ExprConcepts.h" #include "clang/AST/ExprObjC.h" -#include "clang/AST/ExprOpenMP.h" #include "clang/AST/ExternalASTSource.h" #include "clang/AST/LocInfoType.h" #include "clang/AST/MangleNumberingContext.h" #include "clang/AST/NSAPI.h" #include "clang/AST/PrettyPrinter.h" #include "clang/AST/StmtCXX.h" -#include "clang/AST/StmtOpenMP.h" #include "clang/AST/TypeLoc.h" #include "clang/AST/TypeOrdering.h" #include "clang/Basic/BitmaskEnum.h" @@ -43,7 +41,6 @@ #include "clang/Basic/ExpressionTraits.h" #include "clang/Basic/Module.h" #include "clang/Basic/OpenCLOptions.h" -#include "clang/Basic/OpenMPKinds.h" #include "clang/Basic/PragmaKinds.h" #include "clang/Basic/Specifiers.h" #include "clang/Basic/TemplateKinds.h" @@ -68,7 +65,6 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/TinyPtrVector.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" #include #include #include @@ -167,12 +163,6 @@ class ObjCMessageExpr; class ObjCMethodDecl; class ObjCPropertyDecl; class ObjCProtocolDecl; -class OMPThreadPrivateDecl; -class OMPRequiresDecl; -class OMPDeclareReductionDecl; -class OMPDeclareSimdDecl; -class OMPClause; -struct OMPVarListLocTy; struct OverloadCandidate; enum class OverloadCandidateParamOrder : char; enum OverloadCandidateRewriteKind : unsigned; @@ -187,6 +177,7 @@ class QualType; class SemaCUDA; class SemaHLSL; class SemaOpenACC; +class SemaOpenMP; class SemaSYCL; class StandardConversionSequence; class Stmt; @@ -480,7 +471,6 @@ class Sema final : public SemaBase { // 35. Code Completion (SemaCodeComplete.cpp) // 36. FixIt Helpers (SemaFixItUtils.cpp) // 37. Name Lookup for RISC-V Vector Intrinsic (SemaRISCVVectorLookup.cpp) - // 38. OpenMP Directives and Clauses (SemaOpenMP.cpp) /// \name Semantic Analysis /// Implementations are in Sema.cpp @@ -997,6 +987,11 @@ class Sema final : public SemaBase { return *OpenACCPtr; } + SemaOpenMP &OpenMP() { + assert(OpenMPPtr && "SemaOpenMP is dead"); + return *OpenMPPtr; + } + SemaSYCL &SYCL() { assert(SYCLPtr); return *SYCLPtr; @@ -1035,6 +1030,7 @@ class Sema final : public SemaBase { std::unique_ptr CUDAPtr; std::unique_ptr HLSLPtr; std::unique_ptr OpenACCPtr; + std::unique_ptr OpenMPPtr; std::unique_ptr SYCLPtr; ///@} @@ -3443,14 +3439,6 @@ class Sema final : public SemaBase { sema::LambdaScopeInfo *RebuildLambdaScopeInfo(CXXMethodDecl *CallOperator); - /// The declarator \p D defines a function in the scope \p S which is nested - /// in an `omp begin/end declare variant` scope. In this method we create a - /// declaration for \p D and rename \p D according to the OpenMP context - /// selector of the surrounding scope. Return all base functions in \p Bases. - void ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( - Scope *S, Declarator &D, MultiTemplateParamsArg TemplateParameterLists, - SmallVectorImpl &Bases); - // Heuristically tells if the function is `get_return_object` member of a // coroutine promise_type by matching the function name. 
static bool CanBeGetReturnObject(const FunctionDecl *FD); @@ -5533,32 +5521,6 @@ class Sema final : public SemaBase { Expr *ColumnIdx, SourceLocation RBLoc); - ExprResult ActOnOMPArraySectionExpr(Expr *Base, SourceLocation LBLoc, - Expr *LowerBound, - SourceLocation ColonLocFirst, - SourceLocation ColonLocSecond, - Expr *Length, Expr *Stride, - SourceLocation RBLoc); - ExprResult ActOnOMPArrayShapingExpr(Expr *Base, SourceLocation LParenLoc, - SourceLocation RParenLoc, - ArrayRef Dims, - ArrayRef Brackets); - - /// Data structure for iterator expression. - struct OMPIteratorData { - IdentifierInfo *DeclIdent = nullptr; - SourceLocation DeclIdentLoc; - ParsedType Type; - OMPIteratorExpr::IteratorRange Range; - SourceLocation AssignLoc; - SourceLocation ColonLoc; - SourceLocation SecColonLoc; - }; - - ExprResult ActOnOMPIteratorExpr(Scope *S, SourceLocation IteratorKwLoc, - SourceLocation LLoc, SourceLocation RLoc, - ArrayRef Data); - bool ConvertArgumentsForCall(CallExpr *Call, Expr *Fn, FunctionDecl *FDecl, const FunctionProtoType *Proto, ArrayRef Args, SourceLocation RParenLoc, @@ -12863,1373 +12825,6 @@ class Sema final : public SemaBase { std::unique_ptr RVIntrinsicManager; ///@} - - // - // - // ------------------------------------------------------------------------- - // - // - - /// \name OpenMP Directives and Clauses - /// Implementations are in SemaOpenMP.cpp - ///@{ - -public: - /// Creates a SemaDiagnosticBuilder that emits the diagnostic if the current - /// context is "used as device code". - /// - /// - If CurContext is a `declare target` function or it is known that the - /// function is emitted for the device, emits the diagnostics immediately. - /// - If CurContext is a non-`declare target` function and we are compiling - /// for the device, creates a diagnostic which is emitted if and when we - /// realize that the function will be codegen'ed. - /// - /// Example usage: - /// - /// // Variable-length arrays are not allowed in NVPTX device code. - /// if (diagIfOpenMPDeviceCode(Loc, diag::err_vla_unsupported)) - /// return ExprError(); - /// // Otherwise, continue parsing as normal. - SemaDiagnosticBuilder diagIfOpenMPDeviceCode(SourceLocation Loc, - unsigned DiagID, - const FunctionDecl *FD); - - /// Creates a SemaDiagnosticBuilder that emits the diagnostic if the current - /// context is "used as host code". - /// - /// - If CurContext is a `declare target` function or it is known that the - /// function is emitted for the host, emits the diagnostics immediately. - /// - If CurContext is a non-host function, just ignore it. - /// - /// Example usage: - /// - /// // Variable-length arrays are not allowed in NVPTX device code. - /// if (diagIfOpenMPHostode(Loc, diag::err_vla_unsupported)) - /// return ExprError(); - /// // Otherwise, continue parsing as normal. - SemaDiagnosticBuilder diagIfOpenMPHostCode(SourceLocation Loc, - unsigned DiagID, - const FunctionDecl *FD); - - /// Register \p D as specialization of all base functions in \p Bases in the - /// current `omp begin/end declare variant` scope. - void ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope( - Decl *D, SmallVectorImpl &Bases); - - /// Act on \p D, a function definition inside of an `omp [begin/end] assumes`. - void ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D); - - /// Can we exit an OpenMP declare variant scope at the moment. 
- bool isInOpenMPDeclareVariantScope() const { - return !OMPDeclareVariantScopes.empty(); - } - - ExprResult - VerifyPositiveIntegerConstantInClause(Expr *Op, OpenMPClauseKind CKind, - bool StrictlyPositive = true, - bool SuppressExprDiags = false); - - /// Given the potential call expression \p Call, determine if there is a - /// specialization via the OpenMP declare variant mechanism available. If - /// there is, return the specialized call expression, otherwise return the - /// original \p Call. - ExprResult ActOnOpenMPCall(ExprResult Call, Scope *Scope, - SourceLocation LParenLoc, MultiExprArg ArgExprs, - SourceLocation RParenLoc, Expr *ExecConfig); - - /// Handle a `omp begin declare variant`. - void ActOnOpenMPBeginDeclareVariant(SourceLocation Loc, OMPTraitInfo &TI); - - /// Handle a `omp end declare variant`. - void ActOnOpenMPEndDeclareVariant(); - - /// Function tries to capture lambda's captured variables in the OpenMP region - /// before the original lambda is captured. - void tryCaptureOpenMPLambdas(ValueDecl *V); - - /// Return true if the provided declaration \a VD should be captured by - /// reference. - /// \param Level Relative level of nested OpenMP construct for that the check - /// is performed. - /// \param OpenMPCaptureLevel Capture level within an OpenMP construct. - bool isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, - unsigned OpenMPCaptureLevel) const; - - /// Check if the specified variable is used in one of the private - /// clauses (private, firstprivate, lastprivate, reduction etc.) in OpenMP - /// constructs. - VarDecl *isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo = false, - unsigned StopAt = 0); - - /// The member expression(this->fd) needs to be rebuilt in the template - /// instantiation to generate private copy for OpenMP when default - /// clause is used. The function will return true if default - /// cluse is used. - bool isOpenMPRebuildMemberExpr(ValueDecl *D); - - ExprResult getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK, - ExprObjectKind OK, SourceLocation Loc); - - /// If the current region is a loop-based region, mark the start of the loop - /// construct. - void startOpenMPLoop(); - - /// If the current region is a range loop-based region, mark the start of the - /// loop construct. - void startOpenMPCXXRangeFor(); - - /// Check if the specified variable is used in 'private' clause. - /// \param Level Relative level of nested OpenMP construct for that the check - /// is performed. - OpenMPClauseKind isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, - unsigned CapLevel) const; - - /// Sets OpenMP capture kind (OMPC_private, OMPC_firstprivate, OMPC_map etc.) - /// for \p FD based on DSA for the provided corresponding captured declaration - /// \p D. - void setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D, unsigned Level); - - /// Check if the specified variable is captured by 'target' directive. - /// \param Level Relative level of nested OpenMP construct for that the check - /// is performed. - bool isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level, - unsigned CaptureLevel) const; - - /// Check if the specified global variable must be captured by outer capture - /// regions. - /// \param Level Relative level of nested OpenMP construct for that - /// the check is performed. 
- bool isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level, - unsigned CaptureLevel) const; - - ExprResult PerformOpenMPImplicitIntegerConversion(SourceLocation OpLoc, - Expr *Op); - /// Called on start of new data sharing attribute block. - void StartOpenMPDSABlock(OpenMPDirectiveKind K, - const DeclarationNameInfo &DirName, Scope *CurScope, - SourceLocation Loc); - /// Start analysis of clauses. - void StartOpenMPClause(OpenMPClauseKind K); - /// End analysis of clauses. - void EndOpenMPClause(); - /// Called on end of data sharing attribute block. - void EndOpenMPDSABlock(Stmt *CurDirective); - - /// Check if the current region is an OpenMP loop region and if it is, - /// mark loop control variable, used in \p Init for loop initialization, as - /// private by default. - /// \param Init First part of the for loop. - void ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init); - - /// Called on well-formed '\#pragma omp metadirective' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPMetaDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - - // OpenMP directives and clauses. - /// Called on correct id-expression from the '#pragma omp - /// threadprivate'. - ExprResult ActOnOpenMPIdExpression(Scope *CurScope, CXXScopeSpec &ScopeSpec, - const DeclarationNameInfo &Id, - OpenMPDirectiveKind Kind); - /// Called on well-formed '#pragma omp threadprivate'. - DeclGroupPtrTy ActOnOpenMPThreadprivateDirective(SourceLocation Loc, - ArrayRef VarList); - /// Builds a new OpenMPThreadPrivateDecl and checks its correctness. - OMPThreadPrivateDecl *CheckOMPThreadPrivateDecl(SourceLocation Loc, - ArrayRef VarList); - /// Called on well-formed '#pragma omp allocate'. - DeclGroupPtrTy ActOnOpenMPAllocateDirective(SourceLocation Loc, - ArrayRef VarList, - ArrayRef Clauses, - DeclContext *Owner = nullptr); - - /// Called on well-formed '#pragma omp [begin] assume[s]'. - void ActOnOpenMPAssumesDirective(SourceLocation Loc, - OpenMPDirectiveKind DKind, - ArrayRef Assumptions, - bool SkippedClauses); - - /// Check if there is an active global `omp begin assumes` directive. - bool isInOpenMPAssumeScope() const { return !OMPAssumeScoped.empty(); } - - /// Check if there is an active global `omp assumes` directive. - bool hasGlobalOpenMPAssumes() const { return !OMPAssumeGlobal.empty(); } - - /// Called on well-formed '#pragma omp end assumes'. - void ActOnOpenMPEndAssumesDirective(); - - /// Called on well-formed '#pragma omp requires'. - DeclGroupPtrTy ActOnOpenMPRequiresDirective(SourceLocation Loc, - ArrayRef ClauseList); - /// Check restrictions on Requires directive - OMPRequiresDecl *CheckOMPRequiresDecl(SourceLocation Loc, - ArrayRef Clauses); - /// Check if the specified type is allowed to be used in 'omp declare - /// reduction' construct. - QualType ActOnOpenMPDeclareReductionType(SourceLocation TyLoc, - TypeResult ParsedType); - /// Called on start of '#pragma omp declare reduction'. - DeclGroupPtrTy ActOnOpenMPDeclareReductionDirectiveStart( - Scope *S, DeclContext *DC, DeclarationName Name, - ArrayRef> ReductionTypes, - AccessSpecifier AS, Decl *PrevDeclInScope = nullptr); - /// Initialize declare reduction construct initializer. - void ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D); - /// Finish current declare reduction construct initializer. - void ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner); - /// Initialize declare reduction construct initializer. - /// \return omp_priv variable. 
- VarDecl *ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D); - /// Finish current declare reduction construct initializer. - void ActOnOpenMPDeclareReductionInitializerEnd(Decl *D, Expr *Initializer, - VarDecl *OmpPrivParm); - /// Called at the end of '#pragma omp declare reduction'. - DeclGroupPtrTy ActOnOpenMPDeclareReductionDirectiveEnd( - Scope *S, DeclGroupPtrTy DeclReductions, bool IsValid); - - /// Check variable declaration in 'omp declare mapper' construct. - TypeResult ActOnOpenMPDeclareMapperVarDecl(Scope *S, Declarator &D); - /// Check if the specified type is allowed to be used in 'omp declare - /// mapper' construct. - QualType ActOnOpenMPDeclareMapperType(SourceLocation TyLoc, - TypeResult ParsedType); - /// Called on start of '#pragma omp declare mapper'. - DeclGroupPtrTy ActOnOpenMPDeclareMapperDirective( - Scope *S, DeclContext *DC, DeclarationName Name, QualType MapperType, - SourceLocation StartLoc, DeclarationName VN, AccessSpecifier AS, - Expr *MapperVarRef, ArrayRef Clauses, - Decl *PrevDeclInScope = nullptr); - /// Build the mapper variable of '#pragma omp declare mapper'. - ExprResult ActOnOpenMPDeclareMapperDirectiveVarDecl(Scope *S, - QualType MapperType, - SourceLocation StartLoc, - DeclarationName VN); - void ActOnOpenMPIteratorVarDecl(VarDecl *VD); - bool isOpenMPDeclareMapperVarDeclAllowed(const VarDecl *VD) const; - const ValueDecl *getOpenMPDeclareMapperVarName() const; - - struct DeclareTargetContextInfo { - struct MapInfo { - OMPDeclareTargetDeclAttr::MapTypeTy MT; - SourceLocation Loc; - }; - /// Explicitly listed variables and functions in a 'to' or 'link' clause. - llvm::DenseMap ExplicitlyMapped; - - /// The 'device_type' as parsed from the clause. - OMPDeclareTargetDeclAttr::DevTypeTy DT = OMPDeclareTargetDeclAttr::DT_Any; - - /// The directive kind, `begin declare target` or `declare target`. - OpenMPDirectiveKind Kind; - - /// The directive with indirect clause. - std::optional Indirect; - - /// The directive location. - SourceLocation Loc; - - DeclareTargetContextInfo(OpenMPDirectiveKind Kind, SourceLocation Loc) - : Kind(Kind), Loc(Loc) {} - }; - - /// Called on the start of target region i.e. '#pragma omp declare target'. - bool ActOnStartOpenMPDeclareTargetContext(DeclareTargetContextInfo &DTCI); - - /// Called at the end of target region i.e. '#pragma omp end declare target'. - const DeclareTargetContextInfo ActOnOpenMPEndDeclareTargetDirective(); - - /// Called once a target context is completed, that can be when a - /// '#pragma omp end declare target' was encountered or when a - /// '#pragma omp declare target' without declaration-definition-seq was - /// encountered. - void ActOnFinishedOpenMPDeclareTargetContext(DeclareTargetContextInfo &DTCI); - - /// Report unterminated 'omp declare target' or 'omp begin declare target' at - /// the end of a compilation unit. - void DiagnoseUnterminatedOpenMPDeclareTarget(); - - /// Searches for the provided declaration name for OpenMP declare target - /// directive. - NamedDecl *lookupOpenMPDeclareTargetName(Scope *CurScope, - CXXScopeSpec &ScopeSpec, - const DeclarationNameInfo &Id); - - /// Called on correct id-expression from the '#pragma omp declare target'. - void ActOnOpenMPDeclareTargetName(NamedDecl *ND, SourceLocation Loc, - OMPDeclareTargetDeclAttr::MapTypeTy MT, - DeclareTargetContextInfo &DTCI); - - /// Check declaration inside target region. 
- void - checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, - SourceLocation IdLoc = SourceLocation()); - - /// Adds OMPDeclareTargetDeclAttr to referenced variables in declare target - /// directive. - void ActOnOpenMPDeclareTargetInitializer(Decl *D); - - /// Finishes analysis of the deferred functions calls that may be declared as - /// host/nohost during device/host compilation. - void finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, - const FunctionDecl *Callee, - SourceLocation Loc); - - /// Return true if currently in OpenMP task with untied clause context. - bool isInOpenMPTaskUntiedContext() const; - - /// Return true inside OpenMP declare target region. - bool isInOpenMPDeclareTargetContext() const { - return !DeclareTargetNesting.empty(); - } - /// Return true inside OpenMP target region. - bool isInOpenMPTargetExecutionDirective() const; - - /// Return the number of captured regions created for an OpenMP directive. - static int getOpenMPCaptureLevels(OpenMPDirectiveKind Kind); - - /// Initialization of captured region for OpenMP region. - void ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope); - - /// Called for syntactical loops (ForStmt or CXXForRangeStmt) associated to - /// an OpenMP loop directive. - StmtResult ActOnOpenMPCanonicalLoop(Stmt *AStmt); - - /// Process a canonical OpenMP loop nest that can either be a canonical - /// literal loop (ForStmt or CXXForRangeStmt), or the generated loop of an - /// OpenMP loop transformation construct. - StmtResult ActOnOpenMPLoopnest(Stmt *AStmt); - - /// End of OpenMP region. - /// - /// \param S Statement associated with the current OpenMP region. - /// \param Clauses List of clauses for the current OpenMP region. - /// - /// \returns Statement for finished OpenMP region. - StmtResult ActOnOpenMPRegionEnd(StmtResult S, ArrayRef Clauses); - StmtResult ActOnOpenMPExecutableDirective( - OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName, - OpenMPDirectiveKind CancelRegion, ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, - OpenMPDirectiveKind PrevMappedDirective = llvm::omp::OMPD_unknown); - /// Called on well-formed '\#pragma omp parallel' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPParallelDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - using VarsWithInheritedDSAType = - llvm::SmallDenseMap; - /// Called on well-formed '\#pragma omp simd' after parsing - /// of the associated statement. - StmtResult - ActOnOpenMPSimdDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '#pragma omp tile' after parsing of its clauses and - /// the associated statement. - StmtResult ActOnOpenMPTileDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '#pragma omp unroll' after parsing of its clauses - /// and the associated statement. - StmtResult ActOnOpenMPUnrollDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp for' after parsing - /// of the associated statement. - StmtResult - ActOnOpenMPForDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp for simd' after parsing - /// of the associated statement. 
- StmtResult - ActOnOpenMPForSimdDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp sections' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPSectionsDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp section' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPSectionDirective(Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp scope' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPScopeDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp single' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPSingleDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp master' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPMasterDirective(Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp critical' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPCriticalDirective(const DeclarationNameInfo &DirName, - ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp parallel for' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPParallelForDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel for simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelForSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel master' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMasterDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp parallel masked' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMaskedDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp parallel sections' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelSectionsDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp task' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPTaskDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp taskyield'. - StmtResult ActOnOpenMPTaskyieldDirective(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp error'. - /// Error direcitive is allowed in both declared and excutable contexts. - /// Adding InExContext to identify which context is called from. 
- StmtResult ActOnOpenMPErrorDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - bool InExContext = true); - /// Called on well-formed '\#pragma omp barrier'. - StmtResult ActOnOpenMPBarrierDirective(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp taskwait'. - StmtResult ActOnOpenMPTaskwaitDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp taskgroup'. - StmtResult ActOnOpenMPTaskgroupDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp flush'. - StmtResult ActOnOpenMPFlushDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp depobj'. - StmtResult ActOnOpenMPDepobjDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp scan'. - StmtResult ActOnOpenMPScanDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp ordered' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPOrderedDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp atomic' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPAtomicDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp target' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPTargetDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp target data' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPTargetDataDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp target enter data' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetEnterDataDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - Stmt *AStmt); - /// Called on well-formed '\#pragma omp target exit data' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetExitDataDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - Stmt *AStmt); - /// Called on well-formed '\#pragma omp target parallel' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetParallelDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp target parallel for' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetParallelForDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp teams' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPTeamsDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp teams loop' after parsing of the - /// associated statement. 
- StmtResult ActOnOpenMPTeamsGenericLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target teams loop' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPTargetTeamsGenericLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel loop' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPParallelGenericLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target parallel loop' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPTargetParallelGenericLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp cancellation point'. - StmtResult - ActOnOpenMPCancellationPointDirective(SourceLocation StartLoc, - SourceLocation EndLoc, - OpenMPDirectiveKind CancelRegion); - /// Called on well-formed '\#pragma omp cancel'. - StmtResult ActOnOpenMPCancelDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - OpenMPDirectiveKind CancelRegion); - /// Called on well-formed '\#pragma omp taskloop' after parsing of the - /// associated statement. - StmtResult - ActOnOpenMPTaskLoopDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp taskloop simd' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPTaskLoopSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp master taskloop' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPMasterTaskLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp master taskloop simd' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPMasterTaskLoopSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel master taskloop' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMasterTaskLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel master taskloop simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMasterTaskLoopSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp masked taskloop' after parsing of the - /// associated statement. 
- StmtResult ActOnOpenMPMaskedTaskLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp masked taskloop simd' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPMaskedTaskLoopSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel masked taskloop' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMaskedTaskLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel masked taskloop simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMaskedTaskLoopSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp distribute' after parsing - /// of the associated statement. - StmtResult - ActOnOpenMPDistributeDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target update'. - StmtResult ActOnOpenMPTargetUpdateDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - Stmt *AStmt); - /// Called on well-formed '\#pragma omp distribute parallel for' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPDistributeParallelForDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp distribute parallel for simd' - /// after parsing of the associated statement. - StmtResult ActOnOpenMPDistributeParallelForSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp distribute simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPDistributeSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target parallel for simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetParallelForSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target simd' after parsing of - /// the associated statement. - StmtResult - ActOnOpenMPTargetSimdDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp teams distribute' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPTeamsDistributeDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp teams distribute simd' after parsing - /// of the associated statement. 
- StmtResult ActOnOpenMPTeamsDistributeSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp teams distribute parallel for simd' - /// after parsing of the associated statement. - StmtResult ActOnOpenMPTeamsDistributeParallelForSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp teams distribute parallel for' - /// after parsing of the associated statement. - StmtResult ActOnOpenMPTeamsDistributeParallelForDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target teams' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp target teams distribute' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPTargetTeamsDistributeDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target teams distribute parallel for' - /// after parsing of the associated statement. - StmtResult ActOnOpenMPTargetTeamsDistributeParallelForDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target teams distribute parallel for - /// simd' after parsing of the associated statement. - StmtResult ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target teams distribute simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetTeamsDistributeSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp interop'. - StmtResult ActOnOpenMPInteropDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp dispatch' after parsing of the - // /associated statement. - StmtResult ActOnOpenMPDispatchDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp masked' after parsing of the - // /associated statement. - StmtResult ActOnOpenMPMaskedDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed '\#pragma omp loop' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPGenericLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - - /// Checks correctness of linear modifiers. - bool CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind, - SourceLocation LinLoc); - /// Checks that the specified declaration matches requirements for the linear - /// decls. 
- bool CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc, - OpenMPLinearClauseKind LinKind, QualType Type, - bool IsDeclareSimd = false); - - /// Called on well-formed '\#pragma omp declare simd' after parsing of - /// the associated method/function. - DeclGroupPtrTy ActOnOpenMPDeclareSimdDirective( - DeclGroupPtrTy DG, OMPDeclareSimdDeclAttr::BranchStateTy BS, - Expr *Simdlen, ArrayRef Uniforms, ArrayRef Aligneds, - ArrayRef Alignments, ArrayRef Linears, - ArrayRef LinModifiers, ArrayRef Steps, SourceRange SR); - - /// Checks '\#pragma omp declare variant' variant function and original - /// functions after parsing of the associated method/function. - /// \param DG Function declaration to which declare variant directive is - /// applied to. - /// \param VariantRef Expression that references the variant function, which - /// must be used instead of the original one, specified in \p DG. - /// \param TI The trait info object representing the match clause. - /// \param NumAppendArgs The number of omp_interop_t arguments to account for - /// in checking. - /// \returns std::nullopt, if the function/variant function are not compatible - /// with the pragma, pair of original function/variant ref expression - /// otherwise. - std::optional> - checkOpenMPDeclareVariantFunction(DeclGroupPtrTy DG, Expr *VariantRef, - OMPTraitInfo &TI, unsigned NumAppendArgs, - SourceRange SR); - - /// Called on well-formed '\#pragma omp declare variant' after parsing of - /// the associated method/function. - /// \param FD Function declaration to which declare variant directive is - /// applied to. - /// \param VariantRef Expression that references the variant function, which - /// must be used instead of the original one, specified in \p DG. - /// \param TI The context traits associated with the function variant. - /// \param AdjustArgsNothing The list of 'nothing' arguments. - /// \param AdjustArgsNeedDevicePtr The list of 'need_device_ptr' arguments. - /// \param AppendArgs The list of 'append_args' arguments. - /// \param AdjustArgsLoc The Location of an 'adjust_args' clause. - /// \param AppendArgsLoc The Location of an 'append_args' clause. - /// \param SR The SourceRange of the 'declare variant' directive. - void ActOnOpenMPDeclareVariantDirective( - FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI, - ArrayRef AdjustArgsNothing, - ArrayRef AdjustArgsNeedDevicePtr, - ArrayRef AppendArgs, SourceLocation AdjustArgsLoc, - SourceLocation AppendArgsLoc, SourceRange SR); - - OMPClause *ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'allocator' clause. - OMPClause *ActOnOpenMPAllocatorClause(Expr *Allocator, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'if' clause. - OMPClause *ActOnOpenMPIfClause(OpenMPDirectiveKind NameModifier, - Expr *Condition, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation NameModifierLoc, - SourceLocation ColonLoc, - SourceLocation EndLoc); - /// Called on well-formed 'final' clause. - OMPClause *ActOnOpenMPFinalClause(Expr *Condition, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'num_threads' clause. - OMPClause *ActOnOpenMPNumThreadsClause(Expr *NumThreads, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'align' clause. 
- OMPClause *ActOnOpenMPAlignClause(Expr *Alignment, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'safelen' clause. - OMPClause *ActOnOpenMPSafelenClause(Expr *Length, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'simdlen' clause. - OMPClause *ActOnOpenMPSimdlenClause(Expr *Length, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-form 'sizes' clause. - OMPClause *ActOnOpenMPSizesClause(ArrayRef SizeExprs, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-form 'full' clauses. - OMPClause *ActOnOpenMPFullClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-form 'partial' clauses. - OMPClause *ActOnOpenMPPartialClause(Expr *FactorExpr, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'collapse' clause. - OMPClause *ActOnOpenMPCollapseClause(Expr *NumForLoops, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'ordered' clause. - OMPClause * - ActOnOpenMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc, - SourceLocation LParenLoc = SourceLocation(), - Expr *NumForLoops = nullptr); - /// Called on well-formed 'grainsize' clause. - OMPClause *ActOnOpenMPGrainsizeClause(OpenMPGrainsizeClauseModifier Modifier, - Expr *Size, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ModifierLoc, - SourceLocation EndLoc); - /// Called on well-formed 'num_tasks' clause. - OMPClause *ActOnOpenMPNumTasksClause(OpenMPNumTasksClauseModifier Modifier, - Expr *NumTasks, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ModifierLoc, - SourceLocation EndLoc); - /// Called on well-formed 'hint' clause. - OMPClause *ActOnOpenMPHintClause(Expr *Hint, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'detach' clause. - OMPClause *ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - OMPClause *ActOnOpenMPSimpleClause(OpenMPClauseKind Kind, unsigned Argument, - SourceLocation ArgumentLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'when' clause. - OMPClause *ActOnOpenMPWhenClause(OMPTraitInfo &TI, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'default' clause. - OMPClause *ActOnOpenMPDefaultClause(llvm::omp::DefaultKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'proc_bind' clause. - OMPClause *ActOnOpenMPProcBindClause(llvm::omp::ProcBindKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'order' clause. - OMPClause *ActOnOpenMPOrderClause(OpenMPOrderClauseModifier Modifier, - OpenMPOrderClauseKind Kind, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation MLoc, SourceLocation KindLoc, - SourceLocation EndLoc); - /// Called on well-formed 'update' clause. 
- OMPClause *ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - OMPClause *ActOnOpenMPSingleExprWithArgClause( - OpenMPClauseKind Kind, ArrayRef Arguments, Expr *Expr, - SourceLocation StartLoc, SourceLocation LParenLoc, - ArrayRef ArgumentsLoc, SourceLocation DelimLoc, - SourceLocation EndLoc); - /// Called on well-formed 'schedule' clause. - OMPClause *ActOnOpenMPScheduleClause( - OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2, - OpenMPScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc, - SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc, - SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc); - - OMPClause *ActOnOpenMPClause(OpenMPClauseKind Kind, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'nowait' clause. - OMPClause *ActOnOpenMPNowaitClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'untied' clause. - OMPClause *ActOnOpenMPUntiedClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'mergeable' clause. - OMPClause *ActOnOpenMPMergeableClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'read' clause. - OMPClause *ActOnOpenMPReadClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'write' clause. - OMPClause *ActOnOpenMPWriteClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'update' clause. - OMPClause *ActOnOpenMPUpdateClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'capture' clause. - OMPClause *ActOnOpenMPCaptureClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'compare' clause. - OMPClause *ActOnOpenMPCompareClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'fail' clause. - OMPClause *ActOnOpenMPFailClause(SourceLocation StartLoc, - SourceLocation EndLoc); - OMPClause *ActOnOpenMPFailClause(OpenMPClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'seq_cst' clause. - OMPClause *ActOnOpenMPSeqCstClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'acq_rel' clause. - OMPClause *ActOnOpenMPAcqRelClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'acquire' clause. - OMPClause *ActOnOpenMPAcquireClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'release' clause. - OMPClause *ActOnOpenMPReleaseClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'relaxed' clause. - OMPClause *ActOnOpenMPRelaxedClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'weak' clause. - OMPClause *ActOnOpenMPWeakClause(SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'init' clause. - OMPClause * - ActOnOpenMPInitClause(Expr *InteropVar, OMPInteropInfo &InteropInfo, - SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation VarLoc, SourceLocation EndLoc); - - /// Called on well-formed 'use' clause. - OMPClause *ActOnOpenMPUseClause(Expr *InteropVar, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation VarLoc, SourceLocation EndLoc); - - /// Called on well-formed 'destroy' clause. 
- OMPClause *ActOnOpenMPDestroyClause(Expr *InteropVar, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation VarLoc, - SourceLocation EndLoc); - /// Called on well-formed 'novariants' clause. - OMPClause *ActOnOpenMPNovariantsClause(Expr *Condition, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'nocontext' clause. - OMPClause *ActOnOpenMPNocontextClause(Expr *Condition, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'filter' clause. - OMPClause *ActOnOpenMPFilterClause(Expr *ThreadID, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'threads' clause. - OMPClause *ActOnOpenMPThreadsClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'simd' clause. - OMPClause *ActOnOpenMPSIMDClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'nogroup' clause. - OMPClause *ActOnOpenMPNogroupClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'unified_address' clause. - OMPClause *ActOnOpenMPUnifiedAddressClause(SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'unified_address' clause. - OMPClause *ActOnOpenMPUnifiedSharedMemoryClause(SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'reverse_offload' clause. - OMPClause *ActOnOpenMPReverseOffloadClause(SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'dynamic_allocators' clause. - OMPClause *ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'atomic_default_mem_order' clause. - OMPClause *ActOnOpenMPAtomicDefaultMemOrderClause( - OpenMPAtomicDefaultMemOrderClauseKind Kind, SourceLocation KindLoc, - SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); - - /// Called on well-formed 'at' clause. - OMPClause *ActOnOpenMPAtClause(OpenMPAtClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'severity' clause. - OMPClause *ActOnOpenMPSeverityClause(OpenMPSeverityClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'message' clause. - /// passing string for message. - OMPClause *ActOnOpenMPMessageClause(Expr *MS, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Data used for processing a list of variables in OpenMP clauses. - struct OpenMPVarListDataTy final { - Expr *DepModOrTailExpr = nullptr; - Expr *IteratorExpr = nullptr; - SourceLocation ColonLoc; - SourceLocation RLoc; - CXXScopeSpec ReductionOrMapperIdScopeSpec; - DeclarationNameInfo ReductionOrMapperId; - int ExtraModifier = -1; ///< Additional modifier for linear, map, depend or - ///< lastprivate clause. 
- SmallVector - MapTypeModifiers; - SmallVector - MapTypeModifiersLoc; - SmallVector - MotionModifiers; - SmallVector MotionModifiersLoc; - bool IsMapTypeImplicit = false; - SourceLocation ExtraModifierLoc; - SourceLocation OmpAllMemoryLoc; - SourceLocation - StepModifierLoc; /// 'step' modifier location for linear clause - }; - - OMPClause *ActOnOpenMPVarListClause(OpenMPClauseKind Kind, - ArrayRef Vars, - const OMPVarListLocTy &Locs, - OpenMPVarListDataTy &Data); - /// Called on well-formed 'inclusive' clause. - OMPClause *ActOnOpenMPInclusiveClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'exclusive' clause. - OMPClause *ActOnOpenMPExclusiveClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'allocate' clause. - OMPClause * - ActOnOpenMPAllocateClause(Expr *Allocator, ArrayRef VarList, - SourceLocation StartLoc, SourceLocation ColonLoc, - SourceLocation LParenLoc, SourceLocation EndLoc); - /// Called on well-formed 'private' clause. - OMPClause *ActOnOpenMPPrivateClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'firstprivate' clause. - OMPClause *ActOnOpenMPFirstprivateClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'lastprivate' clause. - OMPClause *ActOnOpenMPLastprivateClause( - ArrayRef VarList, OpenMPLastprivateModifier LPKind, - SourceLocation LPKindLoc, SourceLocation ColonLoc, - SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); - /// Called on well-formed 'shared' clause. - OMPClause *ActOnOpenMPSharedClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'reduction' clause. - OMPClause *ActOnOpenMPReductionClause( - ArrayRef VarList, OpenMPReductionClauseModifier Modifier, - SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation ModifierLoc, SourceLocation ColonLoc, - SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, - const DeclarationNameInfo &ReductionId, - ArrayRef UnresolvedReductions = std::nullopt); - /// Called on well-formed 'task_reduction' clause. - OMPClause *ActOnOpenMPTaskReductionClause( - ArrayRef VarList, SourceLocation StartLoc, - SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, - CXXScopeSpec &ReductionIdScopeSpec, - const DeclarationNameInfo &ReductionId, - ArrayRef UnresolvedReductions = std::nullopt); - /// Called on well-formed 'in_reduction' clause. - OMPClause *ActOnOpenMPInReductionClause( - ArrayRef VarList, SourceLocation StartLoc, - SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, - CXXScopeSpec &ReductionIdScopeSpec, - const DeclarationNameInfo &ReductionId, - ArrayRef UnresolvedReductions = std::nullopt); - /// Called on well-formed 'linear' clause. - OMPClause *ActOnOpenMPLinearClause( - ArrayRef VarList, Expr *Step, SourceLocation StartLoc, - SourceLocation LParenLoc, OpenMPLinearClauseKind LinKind, - SourceLocation LinLoc, SourceLocation ColonLoc, - SourceLocation StepModifierLoc, SourceLocation EndLoc); - /// Called on well-formed 'aligned' clause. 
- OMPClause *ActOnOpenMPAlignedClause(ArrayRef VarList, Expr *Alignment, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ColonLoc, - SourceLocation EndLoc); - /// Called on well-formed 'copyin' clause. - OMPClause *ActOnOpenMPCopyinClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'copyprivate' clause. - OMPClause *ActOnOpenMPCopyprivateClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'flush' pseudo clause. - OMPClause *ActOnOpenMPFlushClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'depobj' pseudo clause. - OMPClause *ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'depend' clause. - OMPClause *ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, - Expr *DepModifier, - ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'device' clause. - OMPClause *ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, - Expr *Device, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ModifierLoc, - SourceLocation EndLoc); - /// Called on well-formed 'map' clause. - OMPClause *ActOnOpenMPMapClause( - Expr *IteratorModifier, ArrayRef MapTypeModifiers, - ArrayRef MapTypeModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, - OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, - SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef VarList, - const OMPVarListLocTy &Locs, bool NoDiagnose = false, - ArrayRef UnresolvedMappers = std::nullopt); - /// Called on well-formed 'num_teams' clause. - OMPClause *ActOnOpenMPNumTeamsClause(Expr *NumTeams, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'thread_limit' clause. - OMPClause *ActOnOpenMPThreadLimitClause(Expr *ThreadLimit, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'priority' clause. - OMPClause *ActOnOpenMPPriorityClause(Expr *Priority, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'dist_schedule' clause. - OMPClause *ActOnOpenMPDistScheduleClause( - OpenMPDistScheduleClauseKind Kind, Expr *ChunkSize, - SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation KindLoc, - SourceLocation CommaLoc, SourceLocation EndLoc); - /// Called on well-formed 'defaultmap' clause. - OMPClause *ActOnOpenMPDefaultmapClause( - OpenMPDefaultmapClauseModifier M, OpenMPDefaultmapClauseKind Kind, - SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc, - SourceLocation KindLoc, SourceLocation EndLoc); - /// Called on well-formed 'to' clause. - OMPClause * - ActOnOpenMPToClause(ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, - DeclarationNameInfo &MapperId, SourceLocation ColonLoc, - ArrayRef VarList, const OMPVarListLocTy &Locs, - ArrayRef UnresolvedMappers = std::nullopt); - /// Called on well-formed 'from' clause. 
- OMPClause * - ActOnOpenMPFromClause(ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, - DeclarationNameInfo &MapperId, SourceLocation ColonLoc, - ArrayRef VarList, const OMPVarListLocTy &Locs, - ArrayRef UnresolvedMappers = std::nullopt); - /// Called on well-formed 'use_device_ptr' clause. - OMPClause *ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs); - /// Called on well-formed 'use_device_addr' clause. - OMPClause *ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs); - /// Called on well-formed 'is_device_ptr' clause. - OMPClause *ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs); - /// Called on well-formed 'has_device_addr' clause. - OMPClause *ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs); - /// Called on well-formed 'nontemporal' clause. - OMPClause *ActOnOpenMPNontemporalClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Data for list of allocators. - struct UsesAllocatorsData { - /// Allocator. - Expr *Allocator = nullptr; - /// Allocator traits. - Expr *AllocatorTraits = nullptr; - /// Locations of '(' and ')' symbols. - SourceLocation LParenLoc, RParenLoc; - }; - /// Called on well-formed 'uses_allocators' clause. - OMPClause *ActOnOpenMPUsesAllocatorClause(SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc, - ArrayRef Data); - /// Called on well-formed 'affinity' clause. - OMPClause *ActOnOpenMPAffinityClause(SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ColonLoc, - SourceLocation EndLoc, Expr *Modifier, - ArrayRef Locators); - /// Called on a well-formed 'bind' clause. - OMPClause *ActOnOpenMPBindClause(OpenMPBindClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on a well-formed 'ompx_dyn_cgroup_mem' clause. - OMPClause *ActOnOpenMPXDynCGroupMemClause(Expr *Size, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'doacross' clause. - OMPClause * - ActOnOpenMPDoacrossClause(OpenMPDoacrossClauseModifier DepType, - SourceLocation DepLoc, SourceLocation ColonLoc, - ArrayRef VarList, SourceLocation StartLoc, - SourceLocation LParenLoc, SourceLocation EndLoc); - - /// Called on a well-formed 'ompx_attribute' clause. - OMPClause *ActOnOpenMPXAttributeClause(ArrayRef Attrs, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on a well-formed 'ompx_bare' clause. - OMPClause *ActOnOpenMPXBareClause(SourceLocation StartLoc, - SourceLocation EndLoc); - -private: - void *VarDataSharingAttributesStack; - - /// Number of nested '#pragma omp declare target' directives. - SmallVector DeclareTargetNesting; - - /// Initialization of data-sharing attributes stack. - void InitDataSharingAttributesStack(); - void DestroyDataSharingAttributesStack(); - - /// Returns OpenMP nesting level for current directive. - unsigned getOpenMPNestingLevel() const; - - /// Adjusts the function scopes index for the target-based regions. - void adjustOpenMPTargetScopeIndex(unsigned &FunctionScopesIndex, - unsigned Level) const; - - /// Returns the number of scopes associated with the construct on the given - /// OpenMP level. - int getNumberOfConstructScopes(unsigned Level) const; - - /// Push new OpenMP function region for non-capturing function. 
- void pushOpenMPFunctionRegion(); - - /// Pop OpenMP function region for non-capturing function. - void popOpenMPFunctionRegion(const sema::FunctionScopeInfo *OldFSI); - - /// Analyzes and checks a loop nest for use by a loop transformation. - /// - /// \param Kind The loop transformation directive kind. - /// \param NumLoops How many nested loops the directive is expecting. - /// \param AStmt Associated statement of the transformation directive. - /// \param LoopHelpers [out] The loop analysis result. - /// \param Body [out] The body code nested in \p NumLoops loop. - /// \param OriginalInits [out] Collection of statements and declarations that - /// must have been executed/declared before entering the - /// loop. - /// - /// \return Whether there was any error. - bool checkTransformableLoopNest( - OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops, - SmallVectorImpl &LoopHelpers, - Stmt *&Body, - SmallVectorImpl, 0>> - &OriginalInits); - - /// Helper to keep information about the current `omp begin/end declare - /// variant` nesting. - struct OMPDeclareVariantScope { - /// The associated OpenMP context selector. - OMPTraitInfo *TI; - - /// The associated OpenMP context selector mangling. - std::string NameSuffix; - - OMPDeclareVariantScope(OMPTraitInfo &TI); - }; - - /// Return the OMPTraitInfo for the surrounding scope, if any. - OMPTraitInfo *getOMPTraitInfoForSurroundingScope() { - return OMPDeclareVariantScopes.empty() ? nullptr - : OMPDeclareVariantScopes.back().TI; - } - - /// The current `omp begin/end declare variant` scopes. - SmallVector OMPDeclareVariantScopes; - - /// The current `omp begin/end assumes` scopes. - SmallVector OMPAssumeScoped; - - /// All `omp assumes` we encountered so far. - SmallVector OMPAssumeGlobal; - - /// OMPD_loop is mapped to OMPD_for, OMPD_distribute or OMPD_simd depending - /// on the parameter of the bind clause. In the methods for the - /// mapped directives, check the parameters of the lastprivate clause. - bool checkLastPrivateForMappedDirectives(ArrayRef Clauses); - /// Depending on the bind clause of OMPD_loop map the directive to new - /// directives. - /// 1) loop bind(parallel) --> OMPD_for - /// 2) loop bind(teams) --> OMPD_distribute - /// 3) loop bind(thread) --> OMPD_simd - /// This is being handled in Sema instead of Codegen because of the need for - /// rigorous semantic checking in the new mapped directives. - bool mapLoopConstruct(llvm::SmallVector &ClausesWithoutBind, - ArrayRef Clauses, - OpenMPBindClauseKind &BindKind, - OpenMPDirectiveKind &Kind, - OpenMPDirectiveKind &PrevMappedDirective, - SourceLocation StartLoc, SourceLocation EndLoc, - const DeclarationNameInfo &DirName, - OpenMPDirectiveKind CancelRegion); - - ///@} }; DeductionFailureInfo diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h new file mode 100644 index 00000000000000..9927459bbc5941 --- /dev/null +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -0,0 +1,1447 @@ +//===----- SemaOpenMP.h -- Semantic Analysis for OpenMP constructs -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares semantic analysis for OpenMP constructs and +/// clauses. 
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SEMA_SEMAOPENMP_H
+#define LLVM_CLANG_SEMA_SEMAOPENMP_H
+
+#include "clang/AST/Attr.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclBase.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "clang/AST/DeclarationName.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/ExprOpenMP.h"
+#include "clang/AST/OpenMPClause.h"
+#include "clang/AST/Stmt.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/Type.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/OpenMPKinds.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/Specifiers.h"
+#include "clang/Sema/DeclSpec.h"
+#include "clang/Sema/Ownership.h"
+#include "clang/Sema/Scope.h"
+#include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaBase.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerUnion.h"
+#include <optional>
+#include <string>
+#include <utility>
+
+namespace clang {
+
+class SemaOpenMP : public SemaBase {
+public:
+  SemaOpenMP(Sema &S);
+
+  friend class Parser;
+  friend class Sema;
+
+  using DeclGroupPtrTy = OpaquePtr<DeclGroupRef>;
+  using CapturedParamNameType = std::pair<StringRef, QualType>;
+
+  /// Creates a SemaDiagnosticBuilder that emits the diagnostic if the current
+  /// context is "used as device code".
+  ///
+  /// - If CurContext is a `declare target` function or it is known that the
+  ///   function is emitted for the device, emits the diagnostics immediately.
+  /// - If CurContext is a non-`declare target` function and we are compiling
+  ///   for the device, creates a diagnostic which is emitted if and when we
+  ///   realize that the function will be codegen'ed.
+  ///
+  /// Example usage:
+  ///
+  ///  // Variable-length arrays are not allowed in NVPTX device code.
+  ///  if (diagIfOpenMPDeviceCode(Loc, diag::err_vla_unsupported))
+  ///    return ExprError();
+  ///  // Otherwise, continue parsing as normal.
+  SemaDiagnosticBuilder diagIfOpenMPDeviceCode(SourceLocation Loc,
+                                               unsigned DiagID,
+                                               const FunctionDecl *FD);
+
+  /// Creates a SemaDiagnosticBuilder that emits the diagnostic if the current
+  /// context is "used as host code".
+  ///
+  /// - If CurContext is a `declare target` function or it is known that the
+  ///   function is emitted for the host, emits the diagnostics immediately.
+  /// - If CurContext is a non-host function, just ignore it.
+  ///
+  /// Example usage:
+  ///
+  ///  // Diagnose a feature that is not allowed when emitted for the host.
+  ///  if (diagIfOpenMPHostCode(Loc, diag::err_vla_unsupported))
+  ///    return ExprError();
+  ///  // Otherwise, continue parsing as normal.
+  SemaDiagnosticBuilder diagIfOpenMPHostCode(SourceLocation Loc,
+                                             unsigned DiagID,
+                                             const FunctionDecl *FD);
+
+  /// The declarator \p D defines a function in the scope \p S which is nested
+  /// in an `omp begin/end declare variant` scope. In this method we create a
+  /// declaration for \p D and rename \p D according to the OpenMP context
+  /// selector of the surrounding scope. Return all base functions in \p Bases.
+  void ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(
+      Scope *S, Declarator &D, MultiTemplateParamsArg TemplateParameterLists,
+      SmallVectorImpl<FunctionDecl *> &Bases);
+
+  /// Register \p D as specialization of all base functions in \p Bases in the
+  /// current `omp begin/end declare variant` scope.
+  void ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
+      Decl *D, SmallVectorImpl<FunctionDecl *> &Bases);
+
+  /// Act on \p D, a function definition inside of an `omp [begin/end] assumes`.
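+  /// Illustrative sketch of the source form this acts on (the names and the
+  /// assumption clause are placeholders, not part of the patch):
+  ///
+  ///  #pragma omp begin assumes no_openmp
+  ///  void foo() {}  // \p D is the definition of foo().
+  ///  #pragma omp end assumes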
+  void ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D);
+
+  /// Return true if we can currently exit an OpenMP declare variant scope.
+  bool isInOpenMPDeclareVariantScope() const {
+    return !OMPDeclareVariantScopes.empty();
+  }
+
+  ExprResult
+  VerifyPositiveIntegerConstantInClause(Expr *Op, OpenMPClauseKind CKind,
+                                        bool StrictlyPositive = true,
+                                        bool SuppressExprDiags = false);
+
+  /// Given the potential call expression \p Call, determine if there is a
+  /// specialization via the OpenMP declare variant mechanism available. If
+  /// there is, return the specialized call expression, otherwise return the
+  /// original \p Call.
+  ExprResult ActOnOpenMPCall(ExprResult Call, Scope *Scope,
+                             SourceLocation LParenLoc, MultiExprArg ArgExprs,
+                             SourceLocation RParenLoc, Expr *ExecConfig);
+
+  /// Handle an `omp begin declare variant`.
+  void ActOnOpenMPBeginDeclareVariant(SourceLocation Loc, OMPTraitInfo &TI);
+
+  /// Handle an `omp end declare variant`.
+  void ActOnOpenMPEndDeclareVariant();
+
+  /// Tries to capture a lambda's captured variables in the OpenMP region
+  /// before the original lambda is captured.
+  void tryCaptureOpenMPLambdas(ValueDecl *V);
+
+  /// Return true if the provided declaration \p D should be captured by
+  /// reference.
+  /// \param Level Relative level of nested OpenMP construct for which the
+  /// check is performed.
+  /// \param OpenMPCaptureLevel Capture level within an OpenMP construct.
+  bool isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level,
+                             unsigned OpenMPCaptureLevel) const;
+
+  /// Check if the specified variable is used in one of the private
+  /// clauses (private, firstprivate, lastprivate, reduction etc.) in OpenMP
+  /// constructs.
+  VarDecl *isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo = false,
+                                unsigned StopAt = 0);
+
+  /// The member expression (this->fd) needs to be rebuilt in the template
+  /// instantiation to generate a private copy for OpenMP when the default
+  /// clause is used. The function returns true if the default
+  /// clause is used.
+  bool isOpenMPRebuildMemberExpr(ValueDecl *D);
+
+  ExprResult getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK,
+                                   ExprObjectKind OK, SourceLocation Loc);
+
+  /// If the current region is a loop-based region, mark the start of the loop
+  /// construct.
+  void startOpenMPLoop();
+
+  /// If the current region is a range loop-based region, mark the start of the
+  /// loop construct.
+  void startOpenMPCXXRangeFor();
+
+  /// Check if the specified variable is used in a 'private' clause.
+  /// \param Level Relative level of nested OpenMP construct for which the
+  /// check is performed.
+  OpenMPClauseKind isOpenMPPrivateDecl(ValueDecl *D, unsigned Level,
+                                       unsigned CapLevel) const;
+
+  /// Sets OpenMP capture kind (OMPC_private, OMPC_firstprivate, OMPC_map etc.)
+  /// for \p FD based on DSA for the provided corresponding captured
+  /// declaration \p D.
+  void setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D, unsigned Level);
+
+  /// Check if the specified variable is captured by the 'target' directive.
+  /// \param Level Relative level of nested OpenMP construct for which the
+  /// check is performed.
+  bool isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level,
+                                  unsigned CaptureLevel) const;
+
+  /// Check if the specified global variable must be captured by outer capture
+  /// regions.
+  /// \param Level Relative level of nested OpenMP construct for which the
+  /// check is performed.
+  bool isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level,
+                                  unsigned CaptureLevel) const;
+
+  ExprResult PerformOpenMPImplicitIntegerConversion(SourceLocation OpLoc,
+                                                    Expr *Op);
+  /// Called on start of new data sharing attribute block.
+  void StartOpenMPDSABlock(OpenMPDirectiveKind K,
+                           const DeclarationNameInfo &DirName, Scope *CurScope,
+                           SourceLocation Loc);
+  /// Start analysis of clauses.
+  void StartOpenMPClause(OpenMPClauseKind K);
+  /// End analysis of clauses.
+  void EndOpenMPClause();
+  /// Called on end of data sharing attribute block.
+  void EndOpenMPDSABlock(Stmt *CurDirective);
+
+  /// Check if the current region is an OpenMP loop region and if it is,
+  /// mark the loop control variable, used in \p Init for loop initialization,
+  /// as private by default.
+  /// \param Init First part of the for loop.
+  void ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init);
+
+  /// Called on well-formed '\#pragma omp metadirective' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPMetaDirective(ArrayRef<OMPClause *> Clauses,
+                                      Stmt *AStmt, SourceLocation StartLoc,
+                                      SourceLocation EndLoc);
+
+  // OpenMP directives and clauses.
+  /// Called on correct id-expression from the '#pragma omp
+  /// threadprivate'.
+  ExprResult ActOnOpenMPIdExpression(Scope *CurScope, CXXScopeSpec &ScopeSpec,
+                                     const DeclarationNameInfo &Id,
+                                     OpenMPDirectiveKind Kind);
+  /// Called on well-formed '#pragma omp threadprivate'.
+  DeclGroupPtrTy ActOnOpenMPThreadprivateDirective(SourceLocation Loc,
+                                                   ArrayRef<Expr *> VarList);
+  /// Builds a new OpenMPThreadPrivateDecl and checks its correctness.
+  OMPThreadPrivateDecl *CheckOMPThreadPrivateDecl(SourceLocation Loc,
+                                                  ArrayRef<Expr *> VarList);
+  /// Called on well-formed '#pragma omp allocate'.
+  DeclGroupPtrTy ActOnOpenMPAllocateDirective(SourceLocation Loc,
+                                              ArrayRef<Expr *> VarList,
+                                              ArrayRef<OMPClause *> Clauses,
+                                              DeclContext *Owner = nullptr);
+
+  /// Called on well-formed '#pragma omp [begin] assume[s]'.
+  void ActOnOpenMPAssumesDirective(SourceLocation Loc,
+                                   OpenMPDirectiveKind DKind,
+                                   ArrayRef<std::string> Assumptions,
+                                   bool SkippedClauses);
+
+  /// Check if there is an active global `omp begin assumes` directive.
+  bool isInOpenMPAssumeScope() const { return !OMPAssumeScoped.empty(); }
+
+  /// Check if there is an active global `omp assumes` directive.
+  bool hasGlobalOpenMPAssumes() const { return !OMPAssumeGlobal.empty(); }
+
+  /// Called on well-formed '#pragma omp end assumes'.
+  void ActOnOpenMPEndAssumesDirective();
+
+  /// Called on well-formed '#pragma omp requires'.
+  DeclGroupPtrTy ActOnOpenMPRequiresDirective(SourceLocation Loc,
+                                              ArrayRef<OMPClause *> ClauseList);
+  /// Check restrictions on the 'requires' directive.
+  OMPRequiresDecl *CheckOMPRequiresDecl(SourceLocation Loc,
+                                        ArrayRef<OMPClause *> Clauses);
+  /// Check if the specified type is allowed to be used in 'omp declare
+  /// reduction' construct.
+  QualType ActOnOpenMPDeclareReductionType(SourceLocation TyLoc,
+                                           TypeResult ParsedType);
+  /// Called on start of '#pragma omp declare reduction'.
+  DeclGroupPtrTy ActOnOpenMPDeclareReductionDirectiveStart(
+      Scope *S, DeclContext *DC, DeclarationName Name,
+      ArrayRef<std::pair<QualType, SourceLocation>> ReductionTypes,
+      AccessSpecifier AS, Decl *PrevDeclInScope = nullptr);
+  /// Initialize declare reduction construct combiner.
+  void ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D);
+  /// Finish current declare reduction construct combiner.
+  void ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner);
+  /// Initialize declare reduction construct initializer.
+  /// \return omp_priv variable.
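+  /// Illustrative sketch of a directive whose initializer clause this parses
+  /// (the reduction identifier and types are placeholders); omp_priv names
+  /// the private copy being initialized:
+  ///
+  ///  #pragma omp declare reduction(merge : std::vector<int> :
+  ///      omp_out.insert(omp_out.end(), omp_in.begin(), omp_in.end()))
+  ///      initializer(omp_priv = omp_orig)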
+  VarDecl *ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D);
+  /// Finish current declare reduction construct initializer.
+  void ActOnOpenMPDeclareReductionInitializerEnd(Decl *D, Expr *Initializer,
+                                                 VarDecl *OmpPrivParm);
+  /// Called at the end of '#pragma omp declare reduction'.
+  DeclGroupPtrTy ActOnOpenMPDeclareReductionDirectiveEnd(
+      Scope *S, DeclGroupPtrTy DeclReductions, bool IsValid);
+
+  /// Check variable declaration in 'omp declare mapper' construct.
+  TypeResult ActOnOpenMPDeclareMapperVarDecl(Scope *S, Declarator &D);
+  /// Check if the specified type is allowed to be used in 'omp declare
+  /// mapper' construct.
+  QualType ActOnOpenMPDeclareMapperType(SourceLocation TyLoc,
+                                        TypeResult ParsedType);
+  /// Called on start of '#pragma omp declare mapper'.
+  DeclGroupPtrTy ActOnOpenMPDeclareMapperDirective(
+      Scope *S, DeclContext *DC, DeclarationName Name, QualType MapperType,
+      SourceLocation StartLoc, DeclarationName VN, AccessSpecifier AS,
+      Expr *MapperVarRef, ArrayRef<OMPClause *> Clauses,
+      Decl *PrevDeclInScope = nullptr);
+  /// Build the mapper variable of '#pragma omp declare mapper'.
+  ExprResult ActOnOpenMPDeclareMapperDirectiveVarDecl(Scope *S,
+                                                      QualType MapperType,
+                                                      SourceLocation StartLoc,
+                                                      DeclarationName VN);
+  void ActOnOpenMPIteratorVarDecl(VarDecl *VD);
+  bool isOpenMPDeclareMapperVarDeclAllowed(const VarDecl *VD) const;
+  const ValueDecl *getOpenMPDeclareMapperVarName() const;
+
+  struct DeclareTargetContextInfo {
+    struct MapInfo {
+      OMPDeclareTargetDeclAttr::MapTypeTy MT;
+      SourceLocation Loc;
+    };
+    /// Explicitly listed variables and functions in a 'to' or 'link' clause.
+    llvm::DenseMap<NamedDecl *, MapInfo> ExplicitlyMapped;
+
+    /// The 'device_type' as parsed from the clause.
+    OMPDeclareTargetDeclAttr::DevTypeTy DT = OMPDeclareTargetDeclAttr::DT_Any;
+
+    /// The directive kind, `begin declare target` or `declare target`.
+    OpenMPDirectiveKind Kind;
+
+    /// The directive with indirect clause.
+    std::optional<Expr *> Indirect;
+
+    /// The directive location.
+    SourceLocation Loc;
+
+    DeclareTargetContextInfo(OpenMPDirectiveKind Kind, SourceLocation Loc)
+        : Kind(Kind), Loc(Loc) {}
+  };
+
+  /// Called on the start of a target region, i.e. '#pragma omp declare
+  /// target'.
+  bool ActOnStartOpenMPDeclareTargetContext(DeclareTargetContextInfo &DTCI);
+
+  /// Called at the end of a target region, i.e. '#pragma omp end declare
+  /// target'.
+  const DeclareTargetContextInfo ActOnOpenMPEndDeclareTargetDirective();
+
+  /// Called once a target context is completed, which can be when a
+  /// '#pragma omp end declare target' was encountered or when a
+  /// '#pragma omp declare target' without declaration-definition-seq was
+  /// encountered.
+  void ActOnFinishedOpenMPDeclareTargetContext(DeclareTargetContextInfo &DTCI);
+
+  /// Report unterminated 'omp declare target' or 'omp begin declare target' at
+  /// the end of a compilation unit.
+  void DiagnoseUnterminatedOpenMPDeclareTarget();
+
+  /// Searches for the provided declaration name for OpenMP declare target
+  /// directive.
+  NamedDecl *lookupOpenMPDeclareTargetName(Scope *CurScope,
+                                           CXXScopeSpec &ScopeSpec,
+                                           const DeclarationNameInfo &Id);
+
+  /// Called on correct id-expression from the '#pragma omp declare target'.
+  void ActOnOpenMPDeclareTargetName(NamedDecl *ND, SourceLocation Loc,
+                                    OMPDeclareTargetDeclAttr::MapTypeTy MT,
+                                    DeclareTargetContextInfo &DTCI);
+
+  /// Check declaration inside target region.
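+  /// Illustrative sketch of the kind of reference being checked (names are
+  /// placeholders):
+  ///
+  ///  int G;
+  ///  #pragma omp declare target to(G)
+  ///  void f() {
+  ///  #pragma omp target
+  ///    G = 1; // OK: G is a declare target variable.
+  ///  }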
+  void
+  checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D,
+                                   SourceLocation IdLoc = SourceLocation());
+
+  /// Adds OMPDeclareTargetDeclAttr to referenced variables in declare target
+  /// directive.
+  void ActOnOpenMPDeclareTargetInitializer(Decl *D);
+
+  /// Finishes analysis of the deferred function calls that may be declared as
+  /// host/nohost during device/host compilation.
+  void finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller,
+                                     const FunctionDecl *Callee,
+                                     SourceLocation Loc);
+
+  /// Return true if currently in an OpenMP task context with an 'untied'
+  /// clause.
+  bool isInOpenMPTaskUntiedContext() const;
+
+  /// Return true if inside an OpenMP declare target region.
+  bool isInOpenMPDeclareTargetContext() const {
+    return !DeclareTargetNesting.empty();
+  }
+  /// Return true if inside an OpenMP target region.
+  bool isInOpenMPTargetExecutionDirective() const;
+
+  /// Return the number of captured regions created for an OpenMP directive.
+  static int getOpenMPCaptureLevels(OpenMPDirectiveKind Kind);
+
+  /// Initialization of captured region for OpenMP region.
+  void ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope);
+
+  /// Called for syntactical loops (ForStmt or CXXForRangeStmt) associated
+  /// with an OpenMP loop directive.
+  StmtResult ActOnOpenMPCanonicalLoop(Stmt *AStmt);
+
+  /// Process a canonical OpenMP loop nest that can either be a canonical
+  /// literal loop (ForStmt or CXXForRangeStmt), or the generated loop of an
+  /// OpenMP loop transformation construct.
+  StmtResult ActOnOpenMPLoopnest(Stmt *AStmt);
+
+  /// End of OpenMP region.
+  ///
+  /// \param S Statement associated with the current OpenMP region.
+  /// \param Clauses List of clauses for the current OpenMP region.
+  ///
+  /// \returns Statement for finished OpenMP region.
+  StmtResult ActOnOpenMPRegionEnd(StmtResult S, ArrayRef<OMPClause *> Clauses);
+  StmtResult ActOnOpenMPExecutableDirective(
+      OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName,
+      OpenMPDirectiveKind CancelRegion, ArrayRef<OMPClause *> Clauses,
+      Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc,
+      OpenMPDirectiveKind PrevMappedDirective = llvm::omp::OMPD_unknown);
+  /// Called on well-formed '\#pragma omp parallel' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPParallelDirective(ArrayRef<OMPClause *> Clauses,
+                                          Stmt *AStmt, SourceLocation StartLoc,
+                                          SourceLocation EndLoc);
+  using VarsWithInheritedDSAType =
+      llvm::SmallDenseMap<const ValueDecl *, const Expr *, 4>;
+  /// Called on well-formed '\#pragma omp simd' after parsing
+  /// of the associated statement.
+  StmtResult
+  ActOnOpenMPSimdDirective(ArrayRef<OMPClause *> Clauses, Stmt *AStmt,
+                           SourceLocation StartLoc, SourceLocation EndLoc,
+                           VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '#pragma omp tile' after parsing of its clauses and
+  /// the associated statement.
+  StmtResult ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
+                                      Stmt *AStmt, SourceLocation StartLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed '#pragma omp unroll' after parsing of its clauses
+  /// and the associated statement.
+  StmtResult ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
+                                        Stmt *AStmt, SourceLocation StartLoc,
+                                        SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp for' after parsing
+  /// of the associated statement.
+  StmtResult
+  ActOnOpenMPForDirective(ArrayRef<OMPClause *> Clauses, Stmt *AStmt,
+                          SourceLocation StartLoc, SourceLocation EndLoc,
+                          VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp for simd' after parsing
+  /// of the associated statement.
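+  /// Illustrative sketch (names are placeholders):
+  ///
+  ///  #pragma omp for simd
+  ///  for (int I = 0; I < N; ++I)
+  ///    A[I] = B[I] + C[I];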
+  StmtResult
+  ActOnOpenMPForSimdDirective(ArrayRef<OMPClause *> Clauses, Stmt *AStmt,
+                              SourceLocation StartLoc, SourceLocation EndLoc,
+                              VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp sections' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPSectionsDirective(ArrayRef<OMPClause *> Clauses,
+                                          Stmt *AStmt, SourceLocation StartLoc,
+                                          SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp section' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPSectionDirective(Stmt *AStmt, SourceLocation StartLoc,
+                                         SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp scope' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPScopeDirective(ArrayRef<OMPClause *> Clauses,
+                                       Stmt *AStmt, SourceLocation StartLoc,
+                                       SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp single' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPSingleDirective(ArrayRef<OMPClause *> Clauses,
+                                        Stmt *AStmt, SourceLocation StartLoc,
+                                        SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp master' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPMasterDirective(Stmt *AStmt, SourceLocation StartLoc,
+                                        SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp critical' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPCriticalDirective(const DeclarationNameInfo &DirName,
+                                          ArrayRef<OMPClause *> Clauses,
+                                          Stmt *AStmt, SourceLocation StartLoc,
+                                          SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp parallel for' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPParallelForDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp parallel for simd' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelForSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp parallel master' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelMasterDirective(ArrayRef<OMPClause *> Clauses,
+                                                Stmt *AStmt,
+                                                SourceLocation StartLoc,
+                                                SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp parallel masked' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelMaskedDirective(ArrayRef<OMPClause *> Clauses,
+                                                Stmt *AStmt,
+                                                SourceLocation StartLoc,
+                                                SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp parallel sections' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelSectionsDirective(ArrayRef<OMPClause *> Clauses,
+                                                  Stmt *AStmt,
+                                                  SourceLocation StartLoc,
+                                                  SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp task' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPTaskDirective(ArrayRef<OMPClause *> Clauses,
+                                      Stmt *AStmt, SourceLocation StartLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp taskyield'.
+  StmtResult ActOnOpenMPTaskyieldDirective(SourceLocation StartLoc,
+                                           SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp error'.
+  /// The error directive is allowed in both declarative and executable
+  /// contexts; \p InExContext identifies which context it is called from.
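+  /// Illustrative sketch of both contexts (clause arguments are
+  /// placeholders):
+  ///
+  ///  #pragma omp error at(compilation) severity(warning) message("decl ctx")
+  ///  void f() {
+  ///  #pragma omp error at(execution) message("exec ctx")
+  ///  }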
+  StmtResult ActOnOpenMPErrorDirective(ArrayRef<OMPClause *> Clauses,
+                                       SourceLocation StartLoc,
+                                       SourceLocation EndLoc,
+                                       bool InExContext = true);
+  /// Called on well-formed '\#pragma omp barrier'.
+  StmtResult ActOnOpenMPBarrierDirective(SourceLocation StartLoc,
+                                         SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp taskwait'.
+  StmtResult ActOnOpenMPTaskwaitDirective(ArrayRef<OMPClause *> Clauses,
+                                          SourceLocation StartLoc,
+                                          SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp taskgroup'.
+  StmtResult ActOnOpenMPTaskgroupDirective(ArrayRef<OMPClause *> Clauses,
+                                           Stmt *AStmt, SourceLocation StartLoc,
+                                           SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp flush'.
+  StmtResult ActOnOpenMPFlushDirective(ArrayRef<OMPClause *> Clauses,
+                                       SourceLocation StartLoc,
+                                       SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp depobj'.
+  StmtResult ActOnOpenMPDepobjDirective(ArrayRef<OMPClause *> Clauses,
+                                        SourceLocation StartLoc,
+                                        SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp scan'.
+  StmtResult ActOnOpenMPScanDirective(ArrayRef<OMPClause *> Clauses,
+                                      SourceLocation StartLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp ordered' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPOrderedDirective(ArrayRef<OMPClause *> Clauses,
+                                         Stmt *AStmt, SourceLocation StartLoc,
+                                         SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp atomic' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPAtomicDirective(ArrayRef<OMPClause *> Clauses,
+                                        Stmt *AStmt, SourceLocation StartLoc,
+                                        SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp target' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPTargetDirective(ArrayRef<OMPClause *> Clauses,
+                                        Stmt *AStmt, SourceLocation StartLoc,
+                                        SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp target data' after parsing of
+  /// the associated statement.
+  StmtResult ActOnOpenMPTargetDataDirective(ArrayRef<OMPClause *> Clauses,
+                                            Stmt *AStmt,
+                                            SourceLocation StartLoc,
+                                            SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp target enter data' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetEnterDataDirective(ArrayRef<OMPClause *> Clauses,
+                                                 SourceLocation StartLoc,
+                                                 SourceLocation EndLoc,
+                                                 Stmt *AStmt);
+  /// Called on well-formed '\#pragma omp target exit data' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetExitDataDirective(ArrayRef<OMPClause *> Clauses,
+                                                SourceLocation StartLoc,
+                                                SourceLocation EndLoc,
+                                                Stmt *AStmt);
+  /// Called on well-formed '\#pragma omp target parallel' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetParallelDirective(ArrayRef<OMPClause *> Clauses,
+                                                Stmt *AStmt,
+                                                SourceLocation StartLoc,
+                                                SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp target parallel for' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetParallelForDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp teams' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPTeamsDirective(ArrayRef<OMPClause *> Clauses,
+                                       Stmt *AStmt, SourceLocation StartLoc,
+                                       SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp teams loop' after parsing of the
+  /// associated statement.
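+  /// Illustrative sketch (names are placeholders):
+  ///
+  ///  #pragma omp teams loop
+  ///  for (int I = 0; I < N; ++I)
+  ///    Work(I);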
+  StmtResult ActOnOpenMPTeamsGenericLoopDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target teams loop' after parsing of
+  /// the associated statement.
+  StmtResult ActOnOpenMPTargetTeamsGenericLoopDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp parallel loop' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPParallelGenericLoopDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target parallel loop' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPTargetParallelGenericLoopDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp cancellation point'.
+  StmtResult
+  ActOnOpenMPCancellationPointDirective(SourceLocation StartLoc,
+                                        SourceLocation EndLoc,
+                                        OpenMPDirectiveKind CancelRegion);
+  /// Called on well-formed '\#pragma omp cancel'.
+  StmtResult ActOnOpenMPCancelDirective(ArrayRef<OMPClause *> Clauses,
+                                        SourceLocation StartLoc,
+                                        SourceLocation EndLoc,
+                                        OpenMPDirectiveKind CancelRegion);
+  /// Called on well-formed '\#pragma omp taskloop' after parsing of the
+  /// associated statement.
+  StmtResult
+  ActOnOpenMPTaskLoopDirective(ArrayRef<OMPClause *> Clauses, Stmt *AStmt,
+                               SourceLocation StartLoc, SourceLocation EndLoc,
+                               VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp taskloop simd' after parsing of
+  /// the associated statement.
+  StmtResult ActOnOpenMPTaskLoopSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp master taskloop' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPMasterTaskLoopDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp master taskloop simd' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPMasterTaskLoopSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp parallel master taskloop' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelMasterTaskLoopDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp parallel master taskloop simd' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelMasterTaskLoopSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp masked taskloop' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPMaskedTaskLoopDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp masked taskloop simd' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPMaskedTaskLoopSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp parallel masked taskloop' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelMaskedTaskLoopDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp parallel masked taskloop simd' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelMaskedTaskLoopSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp distribute' after parsing
+  /// of the associated statement.
+  StmtResult
+  ActOnOpenMPDistributeDirective(ArrayRef<OMPClause *> Clauses, Stmt *AStmt,
+                                 SourceLocation StartLoc,
+                                 SourceLocation EndLoc,
+                                 VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target update'.
+  StmtResult ActOnOpenMPTargetUpdateDirective(ArrayRef<OMPClause *> Clauses,
+                                              SourceLocation StartLoc,
+                                              SourceLocation EndLoc,
+                                              Stmt *AStmt);
+  /// Called on well-formed '\#pragma omp distribute parallel for' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPDistributeParallelForDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp distribute parallel for simd'
+  /// after parsing of the associated statement.
+  StmtResult ActOnOpenMPDistributeParallelForSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp distribute simd' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPDistributeSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target parallel for simd' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetParallelForSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target simd' after parsing of
+  /// the associated statement.
+  StmtResult
+  ActOnOpenMPTargetSimdDirective(ArrayRef<OMPClause *> Clauses, Stmt *AStmt,
+                                 SourceLocation StartLoc,
+                                 SourceLocation EndLoc,
+                                 VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp teams distribute' after parsing of
+  /// the associated statement.
+  StmtResult ActOnOpenMPTeamsDistributeDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp teams distribute simd' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPTeamsDistributeSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp teams distribute parallel for simd'
+  /// after parsing of the associated statement.
+  StmtResult ActOnOpenMPTeamsDistributeParallelForSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp teams distribute parallel for'
+  /// after parsing of the associated statement.
+  StmtResult ActOnOpenMPTeamsDistributeParallelForDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target teams' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPTargetTeamsDirective(ArrayRef<OMPClause *> Clauses,
+                                             Stmt *AStmt,
+                                             SourceLocation StartLoc,
+                                             SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp target teams distribute' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPTargetTeamsDistributeDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target teams distribute parallel for'
+  /// after parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetTeamsDistributeParallelForDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target teams distribute parallel for
+  /// simd' after parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target teams distribute simd' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetTeamsDistributeSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp interop'.
+  StmtResult ActOnOpenMPInteropDirective(ArrayRef<OMPClause *> Clauses,
+                                         SourceLocation StartLoc,
+                                         SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp dispatch' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPDispatchDirective(ArrayRef<OMPClause *> Clauses,
+                                          Stmt *AStmt, SourceLocation StartLoc,
+                                          SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp masked' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPMaskedDirective(ArrayRef<OMPClause *> Clauses,
+                                        Stmt *AStmt, SourceLocation StartLoc,
+                                        SourceLocation EndLoc);
+
+  /// Called on well-formed '\#pragma omp loop' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPGenericLoopDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+
+  /// Checks correctness of linear modifiers.
+  bool CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind,
+                                 SourceLocation LinLoc);
+  /// Checks that the specified declaration matches requirements for the linear
+  /// decls.
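+  /// A minimal sketch of a declaration that satisfies these requirements
+  /// ('j' must have integral or pointer type):
+  /// \code
+  ///   int j = 0;
+  ///   #pragma omp simd linear(j : 2)
+  ///   for (int i = 0; i < n; ++i) {
+  ///     a[j] = i;
+  ///     j += 2;
+  ///   }
+  /// \endcode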
+ bool CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc, + OpenMPLinearClauseKind LinKind, QualType Type, + bool IsDeclareSimd = false); + + /// Called on well-formed '\#pragma omp declare simd' after parsing of + /// the associated method/function. + DeclGroupPtrTy ActOnOpenMPDeclareSimdDirective( + DeclGroupPtrTy DG, OMPDeclareSimdDeclAttr::BranchStateTy BS, + Expr *Simdlen, ArrayRef Uniforms, ArrayRef Aligneds, + ArrayRef Alignments, ArrayRef Linears, + ArrayRef LinModifiers, ArrayRef Steps, SourceRange SR); + + /// Checks '\#pragma omp declare variant' variant function and original + /// functions after parsing of the associated method/function. + /// \param DG Function declaration to which declare variant directive is + /// applied to. + /// \param VariantRef Expression that references the variant function, which + /// must be used instead of the original one, specified in \p DG. + /// \param TI The trait info object representing the match clause. + /// \param NumAppendArgs The number of omp_interop_t arguments to account for + /// in checking. + /// \returns std::nullopt, if the function/variant function are not compatible + /// with the pragma, pair of original function/variant ref expression + /// otherwise. + std::optional> + checkOpenMPDeclareVariantFunction(DeclGroupPtrTy DG, Expr *VariantRef, + OMPTraitInfo &TI, unsigned NumAppendArgs, + SourceRange SR); + + /// Called on well-formed '\#pragma omp declare variant' after parsing of + /// the associated method/function. + /// \param FD Function declaration to which declare variant directive is + /// applied to. + /// \param VariantRef Expression that references the variant function, which + /// must be used instead of the original one, specified in \p DG. + /// \param TI The context traits associated with the function variant. + /// \param AdjustArgsNothing The list of 'nothing' arguments. + /// \param AdjustArgsNeedDevicePtr The list of 'need_device_ptr' arguments. + /// \param AppendArgs The list of 'append_args' arguments. + /// \param AdjustArgsLoc The Location of an 'adjust_args' clause. + /// \param AppendArgsLoc The Location of an 'append_args' clause. + /// \param SR The SourceRange of the 'declare variant' directive. + void ActOnOpenMPDeclareVariantDirective( + FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI, + ArrayRef AdjustArgsNothing, + ArrayRef AdjustArgsNeedDevicePtr, + ArrayRef AppendArgs, SourceLocation AdjustArgsLoc, + SourceLocation AppendArgsLoc, SourceRange SR); + + OMPClause *ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'allocator' clause. + OMPClause *ActOnOpenMPAllocatorClause(Expr *Allocator, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'if' clause. + OMPClause *ActOnOpenMPIfClause(OpenMPDirectiveKind NameModifier, + Expr *Condition, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation NameModifierLoc, + SourceLocation ColonLoc, + SourceLocation EndLoc); + /// Called on well-formed 'final' clause. + OMPClause *ActOnOpenMPFinalClause(Expr *Condition, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'num_threads' clause. + OMPClause *ActOnOpenMPNumThreadsClause(Expr *NumThreads, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'align' clause. 
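+  /// E.g. the '8' in the (illustrative) snippet below arrives here as
+  /// \p Alignment:
+  /// \code
+  ///   int x;
+  ///   #pragma omp allocate(x) align(8)
+  /// \endcode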
+  OMPClause *ActOnOpenMPAlignClause(Expr *Alignment, SourceLocation StartLoc,
+                                    SourceLocation LParenLoc,
+                                    SourceLocation EndLoc);
+  /// Called on well-formed 'safelen' clause.
+  OMPClause *ActOnOpenMPSafelenClause(Expr *Length, SourceLocation StartLoc,
+                                      SourceLocation LParenLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed 'simdlen' clause.
+  OMPClause *ActOnOpenMPSimdlenClause(Expr *Length, SourceLocation StartLoc,
+                                      SourceLocation LParenLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed 'sizes' clause.
+  OMPClause *ActOnOpenMPSizesClause(ArrayRef<Expr *> SizeExprs,
+                                    SourceLocation StartLoc,
+                                    SourceLocation LParenLoc,
+                                    SourceLocation EndLoc);
+  /// Called on well-formed 'full' clause.
+  OMPClause *ActOnOpenMPFullClause(SourceLocation StartLoc,
+                                   SourceLocation EndLoc);
+  /// Called on well-formed 'partial' clause.
+  OMPClause *ActOnOpenMPPartialClause(Expr *FactorExpr, SourceLocation StartLoc,
+                                      SourceLocation LParenLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed 'collapse' clause.
+  OMPClause *ActOnOpenMPCollapseClause(Expr *NumForLoops,
+                                       SourceLocation StartLoc,
+                                       SourceLocation LParenLoc,
+                                       SourceLocation EndLoc);
+  /// Called on well-formed 'ordered' clause.
+  OMPClause *
+  ActOnOpenMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc,
+                           SourceLocation LParenLoc = SourceLocation(),
+                           Expr *NumForLoops = nullptr);
+  /// Called on well-formed 'grainsize' clause.
+  OMPClause *ActOnOpenMPGrainsizeClause(OpenMPGrainsizeClauseModifier Modifier,
+                                        Expr *Size, SourceLocation StartLoc,
+                                        SourceLocation LParenLoc,
+                                        SourceLocation ModifierLoc,
+                                        SourceLocation EndLoc);
+  /// Called on well-formed 'num_tasks' clause.
+  OMPClause *ActOnOpenMPNumTasksClause(OpenMPNumTasksClauseModifier Modifier,
+                                       Expr *NumTasks, SourceLocation StartLoc,
+                                       SourceLocation LParenLoc,
+                                       SourceLocation ModifierLoc,
+                                       SourceLocation EndLoc);
+  /// Called on well-formed 'hint' clause.
+  OMPClause *ActOnOpenMPHintClause(Expr *Hint, SourceLocation StartLoc,
+                                   SourceLocation LParenLoc,
+                                   SourceLocation EndLoc);
+  /// Called on well-formed 'detach' clause.
+  OMPClause *ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc,
+                                     SourceLocation LParenLoc,
+                                     SourceLocation EndLoc);
+
+  OMPClause *ActOnOpenMPSimpleClause(OpenMPClauseKind Kind, unsigned Argument,
+                                     SourceLocation ArgumentLoc,
+                                     SourceLocation StartLoc,
+                                     SourceLocation LParenLoc,
+                                     SourceLocation EndLoc);
+  /// Called on well-formed 'when' clause.
+  OMPClause *ActOnOpenMPWhenClause(OMPTraitInfo &TI, SourceLocation StartLoc,
+                                   SourceLocation LParenLoc,
+                                   SourceLocation EndLoc);
+  /// Called on well-formed 'default' clause.
+  OMPClause *ActOnOpenMPDefaultClause(llvm::omp::DefaultKind Kind,
+                                      SourceLocation KindLoc,
+                                      SourceLocation StartLoc,
+                                      SourceLocation LParenLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed 'proc_bind' clause.
+  OMPClause *ActOnOpenMPProcBindClause(llvm::omp::ProcBindKind Kind,
+                                       SourceLocation KindLoc,
+                                       SourceLocation StartLoc,
+                                       SourceLocation LParenLoc,
+                                       SourceLocation EndLoc);
+  /// Called on well-formed 'order' clause.
+  OMPClause *ActOnOpenMPOrderClause(OpenMPOrderClauseModifier Modifier,
+                                    OpenMPOrderClauseKind Kind,
+                                    SourceLocation StartLoc,
+                                    SourceLocation LParenLoc,
+                                    SourceLocation MLoc, SourceLocation KindLoc,
+                                    SourceLocation EndLoc);
+  /// Called on well-formed 'update' clause.
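+  /// E.g. the 'update(inout)' in the (illustrative) snippet below:
+  /// \code
+  ///   omp_depend_t o;                       // from <omp.h>
+  ///   #pragma omp depobj(o) depend(in : x)
+  ///   #pragma omp depobj(o) update(inout)
+  /// \endcode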
+ OMPClause *ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind, + SourceLocation KindLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + OMPClause *ActOnOpenMPSingleExprWithArgClause( + OpenMPClauseKind Kind, ArrayRef Arguments, Expr *Expr, + SourceLocation StartLoc, SourceLocation LParenLoc, + ArrayRef ArgumentsLoc, SourceLocation DelimLoc, + SourceLocation EndLoc); + /// Called on well-formed 'schedule' clause. + OMPClause *ActOnOpenMPScheduleClause( + OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2, + OpenMPScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc, + SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc); + + OMPClause *ActOnOpenMPClause(OpenMPClauseKind Kind, SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'nowait' clause. + OMPClause *ActOnOpenMPNowaitClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'untied' clause. + OMPClause *ActOnOpenMPUntiedClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'mergeable' clause. + OMPClause *ActOnOpenMPMergeableClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'read' clause. + OMPClause *ActOnOpenMPReadClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'write' clause. + OMPClause *ActOnOpenMPWriteClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'update' clause. + OMPClause *ActOnOpenMPUpdateClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'capture' clause. + OMPClause *ActOnOpenMPCaptureClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'compare' clause. + OMPClause *ActOnOpenMPCompareClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'fail' clause. + OMPClause *ActOnOpenMPFailClause(SourceLocation StartLoc, + SourceLocation EndLoc); + OMPClause *ActOnOpenMPFailClause(OpenMPClauseKind Kind, + SourceLocation KindLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + /// Called on well-formed 'seq_cst' clause. + OMPClause *ActOnOpenMPSeqCstClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'acq_rel' clause. + OMPClause *ActOnOpenMPAcqRelClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'acquire' clause. + OMPClause *ActOnOpenMPAcquireClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'release' clause. + OMPClause *ActOnOpenMPReleaseClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'relaxed' clause. + OMPClause *ActOnOpenMPRelaxedClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'weak' clause. + OMPClause *ActOnOpenMPWeakClause(SourceLocation StartLoc, + SourceLocation EndLoc); + + /// Called on well-formed 'init' clause. + OMPClause * + ActOnOpenMPInitClause(Expr *InteropVar, OMPInteropInfo &InteropInfo, + SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation VarLoc, SourceLocation EndLoc); + + /// Called on well-formed 'use' clause. + OMPClause *ActOnOpenMPUseClause(Expr *InteropVar, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation VarLoc, SourceLocation EndLoc); + + /// Called on well-formed 'destroy' clause. 
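+  /// E.g. the 'destroy(obj)' in the (illustrative) snippet below:
+  /// \code
+  ///   omp_interop_t obj = omp_interop_none; // from <omp.h>
+  ///   #pragma omp interop init(targetsync : obj)
+  ///   #pragma omp interop destroy(obj)
+  /// \endcode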
+  OMPClause *ActOnOpenMPDestroyClause(Expr *InteropVar, SourceLocation StartLoc,
+                                      SourceLocation LParenLoc,
+                                      SourceLocation VarLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed 'novariants' clause.
+  OMPClause *ActOnOpenMPNovariantsClause(Expr *Condition,
+                                         SourceLocation StartLoc,
+                                         SourceLocation LParenLoc,
+                                         SourceLocation EndLoc);
+  /// Called on well-formed 'nocontext' clause.
+  OMPClause *ActOnOpenMPNocontextClause(Expr *Condition,
+                                        SourceLocation StartLoc,
+                                        SourceLocation LParenLoc,
+                                        SourceLocation EndLoc);
+  /// Called on well-formed 'filter' clause.
+  OMPClause *ActOnOpenMPFilterClause(Expr *ThreadID, SourceLocation StartLoc,
+                                     SourceLocation LParenLoc,
+                                     SourceLocation EndLoc);
+  /// Called on well-formed 'threads' clause.
+  OMPClause *ActOnOpenMPThreadsClause(SourceLocation StartLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed 'simd' clause.
+  OMPClause *ActOnOpenMPSIMDClause(SourceLocation StartLoc,
+                                   SourceLocation EndLoc);
+  /// Called on well-formed 'nogroup' clause.
+  OMPClause *ActOnOpenMPNogroupClause(SourceLocation StartLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed 'unified_address' clause.
+  OMPClause *ActOnOpenMPUnifiedAddressClause(SourceLocation StartLoc,
+                                             SourceLocation EndLoc);
+
+  /// Called on well-formed 'unified_shared_memory' clause.
+  OMPClause *ActOnOpenMPUnifiedSharedMemoryClause(SourceLocation StartLoc,
+                                                  SourceLocation EndLoc);
+
+  /// Called on well-formed 'reverse_offload' clause.
+  OMPClause *ActOnOpenMPReverseOffloadClause(SourceLocation StartLoc,
+                                             SourceLocation EndLoc);
+
+  /// Called on well-formed 'dynamic_allocators' clause.
+  OMPClause *ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc,
+                                                SourceLocation EndLoc);
+
+  /// Called on well-formed 'atomic_default_mem_order' clause.
+  OMPClause *ActOnOpenMPAtomicDefaultMemOrderClause(
+      OpenMPAtomicDefaultMemOrderClauseKind Kind, SourceLocation KindLoc,
+      SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc);
+
+  /// Called on well-formed 'at' clause.
+  OMPClause *ActOnOpenMPAtClause(OpenMPAtClauseKind Kind,
+                                 SourceLocation KindLoc,
+                                 SourceLocation StartLoc,
+                                 SourceLocation LParenLoc,
+                                 SourceLocation EndLoc);
+
+  /// Called on well-formed 'severity' clause.
+  OMPClause *ActOnOpenMPSeverityClause(OpenMPSeverityClauseKind Kind,
+                                       SourceLocation KindLoc,
+                                       SourceLocation StartLoc,
+                                       SourceLocation LParenLoc,
+                                       SourceLocation EndLoc);
+
+  /// Called on well-formed 'message' clause, passing the message string.
+  OMPClause *ActOnOpenMPMessageClause(Expr *MS, SourceLocation StartLoc,
+                                      SourceLocation LParenLoc,
+                                      SourceLocation EndLoc);
+
+  /// Data used for processing a list of variables in OpenMP clauses.
+  struct OpenMPVarListDataTy final {
+    Expr *DepModOrTailExpr = nullptr;
+    Expr *IteratorExpr = nullptr;
+    SourceLocation ColonLoc;
+    SourceLocation RLoc;
+    CXXScopeSpec ReductionOrMapperIdScopeSpec;
+    DeclarationNameInfo ReductionOrMapperId;
+    int ExtraModifier = -1; ///< Additional modifier for linear, map, depend or
+                            ///< lastprivate clause.
+ SmallVector + MapTypeModifiers; + SmallVector + MapTypeModifiersLoc; + SmallVector + MotionModifiers; + SmallVector MotionModifiersLoc; + bool IsMapTypeImplicit = false; + SourceLocation ExtraModifierLoc; + SourceLocation OmpAllMemoryLoc; + SourceLocation + StepModifierLoc; /// 'step' modifier location for linear clause + }; + + OMPClause *ActOnOpenMPVarListClause(OpenMPClauseKind Kind, + ArrayRef Vars, + const OMPVarListLocTy &Locs, + OpenMPVarListDataTy &Data); + /// Called on well-formed 'inclusive' clause. + OMPClause *ActOnOpenMPInclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'exclusive' clause. + OMPClause *ActOnOpenMPExclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'allocate' clause. + OMPClause * + ActOnOpenMPAllocateClause(Expr *Allocator, ArrayRef VarList, + SourceLocation StartLoc, SourceLocation ColonLoc, + SourceLocation LParenLoc, SourceLocation EndLoc); + /// Called on well-formed 'private' clause. + OMPClause *ActOnOpenMPPrivateClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'firstprivate' clause. + OMPClause *ActOnOpenMPFirstprivateClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'lastprivate' clause. + OMPClause *ActOnOpenMPLastprivateClause( + ArrayRef VarList, OpenMPLastprivateModifier LPKind, + SourceLocation LPKindLoc, SourceLocation ColonLoc, + SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); + /// Called on well-formed 'shared' clause. + OMPClause *ActOnOpenMPSharedClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'reduction' clause. + OMPClause *ActOnOpenMPReductionClause( + ArrayRef VarList, OpenMPReductionClauseModifier Modifier, + SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation ModifierLoc, SourceLocation ColonLoc, + SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, + const DeclarationNameInfo &ReductionId, + ArrayRef UnresolvedReductions = std::nullopt); + /// Called on well-formed 'task_reduction' clause. + OMPClause *ActOnOpenMPTaskReductionClause( + ArrayRef VarList, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, + CXXScopeSpec &ReductionIdScopeSpec, + const DeclarationNameInfo &ReductionId, + ArrayRef UnresolvedReductions = std::nullopt); + /// Called on well-formed 'in_reduction' clause. + OMPClause *ActOnOpenMPInReductionClause( + ArrayRef VarList, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, + CXXScopeSpec &ReductionIdScopeSpec, + const DeclarationNameInfo &ReductionId, + ArrayRef UnresolvedReductions = std::nullopt); + /// Called on well-formed 'linear' clause. + OMPClause *ActOnOpenMPLinearClause( + ArrayRef VarList, Expr *Step, SourceLocation StartLoc, + SourceLocation LParenLoc, OpenMPLinearClauseKind LinKind, + SourceLocation LinLoc, SourceLocation ColonLoc, + SourceLocation StepModifierLoc, SourceLocation EndLoc); + /// Called on well-formed 'aligned' clause. 
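+  /// E.g. the 'aligned(p : 64)' in the (illustrative) snippet below:
+  /// \code
+  ///   alignas(64) float p[256];
+  ///   #pragma omp simd aligned(p : 64)
+  ///   for (int i = 0; i < 256; ++i)
+  ///     p[i] *= 2.0f;
+  /// \endcode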
+ OMPClause *ActOnOpenMPAlignedClause(ArrayRef VarList, Expr *Alignment, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation ColonLoc, + SourceLocation EndLoc); + /// Called on well-formed 'copyin' clause. + OMPClause *ActOnOpenMPCopyinClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'copyprivate' clause. + OMPClause *ActOnOpenMPCopyprivateClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'flush' pseudo clause. + OMPClause *ActOnOpenMPFlushClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'depobj' pseudo clause. + OMPClause *ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'depend' clause. + OMPClause *ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, + Expr *DepModifier, + ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'device' clause. + OMPClause *ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, + Expr *Device, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation ModifierLoc, + SourceLocation EndLoc); + /// Called on well-formed 'map' clause. + OMPClause *ActOnOpenMPMapClause( + Expr *IteratorModifier, ArrayRef MapTypeModifiers, + ArrayRef MapTypeModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, + OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, + SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef VarList, + const OMPVarListLocTy &Locs, bool NoDiagnose = false, + ArrayRef UnresolvedMappers = std::nullopt); + /// Called on well-formed 'num_teams' clause. + OMPClause *ActOnOpenMPNumTeamsClause(Expr *NumTeams, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'thread_limit' clause. + OMPClause *ActOnOpenMPThreadLimitClause(Expr *ThreadLimit, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'priority' clause. + OMPClause *ActOnOpenMPPriorityClause(Expr *Priority, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'dist_schedule' clause. + OMPClause *ActOnOpenMPDistScheduleClause( + OpenMPDistScheduleClauseKind Kind, Expr *ChunkSize, + SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation KindLoc, + SourceLocation CommaLoc, SourceLocation EndLoc); + /// Called on well-formed 'defaultmap' clause. + OMPClause *ActOnOpenMPDefaultmapClause( + OpenMPDefaultmapClauseModifier M, OpenMPDefaultmapClauseKind Kind, + SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc, + SourceLocation KindLoc, SourceLocation EndLoc); + /// Called on well-formed 'to' clause. + OMPClause * + ActOnOpenMPToClause(ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, + DeclarationNameInfo &MapperId, SourceLocation ColonLoc, + ArrayRef VarList, const OMPVarListLocTy &Locs, + ArrayRef UnresolvedMappers = std::nullopt); + /// Called on well-formed 'from' clause. 
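+  /// E.g. the 'from(a[0:n])' in the (illustrative) snippet below, which
+  /// copies device data back to the host:
+  /// \code
+  ///   #pragma omp target update from(a[0:n])
+  /// \endcode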
+ OMPClause * + ActOnOpenMPFromClause(ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, + DeclarationNameInfo &MapperId, SourceLocation ColonLoc, + ArrayRef VarList, const OMPVarListLocTy &Locs, + ArrayRef UnresolvedMappers = std::nullopt); + /// Called on well-formed 'use_device_ptr' clause. + OMPClause *ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs); + /// Called on well-formed 'use_device_addr' clause. + OMPClause *ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs); + /// Called on well-formed 'is_device_ptr' clause. + OMPClause *ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs); + /// Called on well-formed 'has_device_addr' clause. + OMPClause *ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs); + /// Called on well-formed 'nontemporal' clause. + OMPClause *ActOnOpenMPNontemporalClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + /// Data for list of allocators. + struct UsesAllocatorsData { + /// Allocator. + Expr *Allocator = nullptr; + /// Allocator traits. + Expr *AllocatorTraits = nullptr; + /// Locations of '(' and ')' symbols. + SourceLocation LParenLoc, RParenLoc; + }; + /// Called on well-formed 'uses_allocators' clause. + OMPClause *ActOnOpenMPUsesAllocatorClause(SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, + ArrayRef Data); + /// Called on well-formed 'affinity' clause. + OMPClause *ActOnOpenMPAffinityClause(SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation ColonLoc, + SourceLocation EndLoc, Expr *Modifier, + ArrayRef Locators); + /// Called on a well-formed 'bind' clause. + OMPClause *ActOnOpenMPBindClause(OpenMPBindClauseKind Kind, + SourceLocation KindLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + /// Called on a well-formed 'ompx_dyn_cgroup_mem' clause. + OMPClause *ActOnOpenMPXDynCGroupMemClause(Expr *Size, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + /// Called on well-formed 'doacross' clause. + OMPClause * + ActOnOpenMPDoacrossClause(OpenMPDoacrossClauseModifier DepType, + SourceLocation DepLoc, SourceLocation ColonLoc, + ArrayRef VarList, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation EndLoc); + + /// Called on a well-formed 'ompx_attribute' clause. + OMPClause *ActOnOpenMPXAttributeClause(ArrayRef Attrs, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + /// Called on a well-formed 'ompx_bare' clause. + OMPClause *ActOnOpenMPXBareClause(SourceLocation StartLoc, + SourceLocation EndLoc); + + ExprResult ActOnOMPArraySectionExpr(Expr *Base, SourceLocation LBLoc, + Expr *LowerBound, + SourceLocation ColonLocFirst, + SourceLocation ColonLocSecond, + Expr *Length, Expr *Stride, + SourceLocation RBLoc); + ExprResult ActOnOMPArrayShapingExpr(Expr *Base, SourceLocation LParenLoc, + SourceLocation RParenLoc, + ArrayRef Dims, + ArrayRef Brackets); + + /// Data structure for iterator expression. 
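+  /// E.g. for 'depend(iterator(it = 0 : n), in : a[it])' (illustrative),
+  /// one instance describes 'it': its identifier, implied 'int' type, and
+  /// the range '0 : n'.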
+  struct OMPIteratorData {
+    IdentifierInfo *DeclIdent = nullptr;
+    SourceLocation DeclIdentLoc;
+    ParsedType Type;
+    OMPIteratorExpr::IteratorRange Range;
+    SourceLocation AssignLoc;
+    SourceLocation ColonLoc;
+    SourceLocation SecColonLoc;
+  };
+
+  ExprResult ActOnOMPIteratorExpr(Scope *S, SourceLocation IteratorKwLoc,
+                                  SourceLocation LLoc, SourceLocation RLoc,
+                                  ArrayRef<OMPIteratorData> Data);
+
+private:
+  void *VarDataSharingAttributesStack;
+
+  /// Number of nested '#pragma omp declare target' directives.
+  SmallVector DeclareTargetNesting;
+
+  /// Initialization of data-sharing attributes stack.
+  void InitDataSharingAttributesStack();
+  void DestroyDataSharingAttributesStack();
+
+  /// Returns OpenMP nesting level for current directive.
+  unsigned getOpenMPNestingLevel() const;
+
+  /// Adjusts the function scopes index for the target-based regions.
+  void adjustOpenMPTargetScopeIndex(unsigned &FunctionScopesIndex,
+                                    unsigned Level) const;
+
+  /// Returns the number of scopes associated with the construct on the given
+  /// OpenMP level.
+  int getNumberOfConstructScopes(unsigned Level) const;
+
+  /// Push new OpenMP function region for non-capturing function.
+  void pushOpenMPFunctionRegion();
+
+  /// Pop OpenMP function region for non-capturing function.
+  void popOpenMPFunctionRegion(const sema::FunctionScopeInfo *OldFSI);
+
+  /// Analyzes and checks a loop nest for use by a loop transformation.
+  ///
+  /// \param Kind The loop transformation directive kind.
+  /// \param NumLoops How many nested loops the directive is expecting.
+  /// \param AStmt Associated statement of the transformation directive.
+  /// \param LoopHelpers [out] The loop analysis result.
+  /// \param Body [out] The body code nested in \p NumLoops loop.
+  /// \param OriginalInits [out] Collection of statements and declarations that
+  ///                      must have been executed/declared before entering the
+  ///                      loop.
+  ///
+  /// \return Whether there was any error.
+  bool checkTransformableLoopNest(
+      OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
+      SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
+      Stmt *&Body,
+      SmallVectorImpl<SmallVector<Stmt *, 0>>
+          &OriginalInits);
+
+  /// Helper to keep information about the current `omp begin/end declare
+  /// variant` nesting.
+  struct OMPDeclareVariantScope {
+    /// The associated OpenMP context selector.
+    OMPTraitInfo *TI;
+
+    /// The associated OpenMP context selector mangling.
+    std::string NameSuffix;
+
+    OMPDeclareVariantScope(OMPTraitInfo &TI);
+  };
+
+  /// Return the OMPTraitInfo for the surrounding scope, if any.
+  OMPTraitInfo *getOMPTraitInfoForSurroundingScope() {
+    return OMPDeclareVariantScopes.empty() ? nullptr
+                                           : OMPDeclareVariantScopes.back().TI;
+  }
+
+  /// The current `omp begin/end declare variant` scopes.
+  SmallVector OMPDeclareVariantScopes;
+
+  /// The current `omp begin/end assumes` scopes.
+  SmallVector OMPAssumeScoped;
+
+  /// All `omp assumes` we encountered so far.
+  SmallVector OMPAssumeGlobal;
+
+  /// OMPD_loop is mapped to OMPD_for, OMPD_distribute or OMPD_simd depending
+  /// on the parameter of the bind clause. In the methods for the
+  /// mapped directives, check the parameters of the lastprivate clause.
+  bool checkLastPrivateForMappedDirectives(ArrayRef<OMPClause *> Clauses);
+  /// Depending on the bind clause of OMPD_loop map the directive to new
+  /// directives.
+ /// 1) loop bind(parallel) --> OMPD_for + /// 2) loop bind(teams) --> OMPD_distribute + /// 3) loop bind(thread) --> OMPD_simd + /// This is being handled in Sema instead of Codegen because of the need for + /// rigorous semantic checking in the new mapped directives. + bool mapLoopConstruct(llvm::SmallVector &ClausesWithoutBind, + ArrayRef Clauses, + OpenMPBindClauseKind &BindKind, + OpenMPDirectiveKind &Kind, + OpenMPDirectiveKind &PrevMappedDirective, + SourceLocation StartLoc, SourceLocation EndLoc, + const DeclarationNameInfo &DirName, + OpenMPDirectiveKind CancelRegion); +}; + +} // namespace clang + +#endif // LLVM_CLANG_SEMA_SEMAOPENMP_H diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 2b934234b7cf5d..c881b37507771a 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -28,6 +28,7 @@ #include "clang/Sema/Scope.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaDiagnostic.h" +#include "clang/Sema/SemaOpenMP.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" @@ -2383,7 +2384,7 @@ Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS, } if (getLangOpts().OpenMP) - Actions.startOpenMPCXXRangeFor(); + Actions.OpenMP().startOpenMPCXXRangeFor(); if (Tok.is(tok::l_brace)) FRI->RangeExpr = ParseBraceInitializer(); else diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 473ec9afd60181..32d96f81c4c8de 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -31,6 +31,7 @@ #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/SemaCUDA.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/SemaSYCL.h" #include "clang/Sema/TypoCorrection.h" #include "llvm/ADT/SmallVector.h" @@ -2075,7 +2076,7 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { // replace this call to ActOnOpenACCArraySectionExpr in the future. // Eventually we'll genericize the OPenMPArraySectionExpr type as // well. - LHS = Actions.ActOnOMPArraySectionExpr( + LHS = Actions.OpenMP().ActOnOMPArraySectionExpr( LHS.get(), Loc, ArgExprs.empty() ? 
nullptr : ArgExprs[0], ColonLocFirst, ColonLocSecond, Length.get(), Stride.get(), RLoc); } else { @@ -3277,7 +3278,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, if (ErrorFound) { Result = ExprError(); } else if (!Result.isInvalid()) { - Result = Actions.ActOnOMPArrayShapingExpr( + Result = Actions.OpenMP().ActOnOMPArrayShapingExpr( Result.get(), OpenLoc, RParenLoc, OMPDimensions, OMPBracketsRanges); } return Result; diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 814126e321d3bc..480201bc06f613 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -21,6 +21,7 @@ #include "clang/Parse/RAIIObjectsForParser.h" #include "clang/Sema/EnterExpressionEvaluationContext.h" #include "clang/Sema/Scope.h" +#include "clang/Sema/SemaOpenMP.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/UniqueVector.h" @@ -87,7 +88,7 @@ class DeclDirectiveListParserHelper final { DeclDirectiveListParserHelper(Parser *P, OpenMPDirectiveKind Kind) : P(P), Kind(Kind) {} void operator()(CXXScopeSpec &SS, DeclarationNameInfo NameInfo) { - ExprResult Res = P->getActions().ActOnOpenMPIdExpression( + ExprResult Res = P->getActions().OpenMP().ActOnOpenMPIdExpression( P->getCurScope(), SS, NameInfo, Kind); if (Res.isUsable()) Identifiers.push_back(Res.get()); @@ -322,8 +323,8 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { SourceRange Range; TypeResult TR = ParseTypeName(&Range, DeclaratorContext::Prototype, AS); if (TR.isUsable()) { - QualType ReductionType = - Actions.ActOnOpenMPDeclareReductionType(Range.getBegin(), TR); + QualType ReductionType = Actions.OpenMP().ActOnOpenMPDeclareReductionType( + Range.getBegin(), TR); if (!ReductionType.isNull()) { ReductionTypes.push_back( std::make_pair(ReductionType, Range.getBegin())); @@ -363,8 +364,10 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { return DeclGroupPtrTy(); } - DeclGroupPtrTy DRD = Actions.ActOnOpenMPDeclareReductionDirectiveStart( - getCurScope(), Actions.getCurLexicalContext(), Name, ReductionTypes, AS); + DeclGroupPtrTy DRD = + Actions.OpenMP().ActOnOpenMPDeclareReductionDirectiveStart( + getCurScope(), Actions.getCurLexicalContext(), Name, ReductionTypes, + AS); // Parse expression and then parse initializer if any for each // correct type. @@ -375,10 +378,11 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { Scope::CompoundStmtScope | Scope::OpenMPDirectiveScope); // Parse expression. - Actions.ActOnOpenMPDeclareReductionCombinerStart(getCurScope(), D); + Actions.OpenMP().ActOnOpenMPDeclareReductionCombinerStart(getCurScope(), D); ExprResult CombinerResult = Actions.ActOnFinishFullExpr( ParseExpression().get(), D->getLocation(), /*DiscardedValue*/ false); - Actions.ActOnOpenMPDeclareReductionCombinerEnd(D, CombinerResult.get()); + Actions.OpenMP().ActOnOpenMPDeclareReductionCombinerEnd( + D, CombinerResult.get()); if (CombinerResult.isInvalid() && Tok.isNot(tok::r_paren) && Tok.isNot(tok::annot_pragma_openmp_end)) { @@ -411,8 +415,8 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { Scope::OpenMPDirectiveScope); // Parse expression. VarDecl *OmpPrivParm = - Actions.ActOnOpenMPDeclareReductionInitializerStart(getCurScope(), - D); + Actions.OpenMP().ActOnOpenMPDeclareReductionInitializerStart( + getCurScope(), D); // Check if initializer is omp_priv or something else. 
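      // E.g. 'initializer(omp_priv = 0)' assigns to the private copy directly,
      // while 'initializer(fn(&omp_priv))' calls a user function; 'fn' is an
      // illustrative placeholder.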
if (Tok.is(tok::identifier) && Tok.getIdentifierInfo()->isStr("omp_priv")) { @@ -423,7 +427,7 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { ParseAssignmentExpression().get(), D->getLocation(), /*DiscardedValue*/ false); } - Actions.ActOnOpenMPDeclareReductionInitializerEnd( + Actions.OpenMP().ActOnOpenMPDeclareReductionInitializerEnd( D, InitializerResult.get(), OmpPrivParm); if (InitializerResult.isInvalid() && Tok.isNot(tok::r_paren) && Tok.isNot(tok::annot_pragma_openmp_end)) { @@ -444,8 +448,8 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { else TPA.Commit(); } - return Actions.ActOnOpenMPDeclareReductionDirectiveEnd(getCurScope(), DRD, - IsCorrect); + return Actions.OpenMP().ActOnOpenMPDeclareReductionDirectiveEnd( + getCurScope(), DRD, IsCorrect); } void Parser::ParseOpenMPReductionInitializerForDecl(VarDecl *OmpPrivParm) { @@ -569,8 +573,8 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) { SourceRange Range; TypeResult ParsedType = parseOpenMPDeclareMapperVarDecl(Range, VName, AS); if (ParsedType.isUsable()) - MapperType = - Actions.ActOnOpenMPDeclareMapperType(Range.getBegin(), ParsedType); + MapperType = Actions.OpenMP().ActOnOpenMPDeclareMapperType(Range.getBegin(), + ParsedType); if (MapperType.isNull()) IsCorrect = false; if (!IsCorrect) { @@ -591,11 +595,13 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) { unsigned ScopeFlags = Scope::FnScope | Scope::DeclScope | Scope::CompoundStmtScope | Scope::OpenMPDirectiveScope; ParseScope OMPDirectiveScope(this, ScopeFlags); - Actions.StartOpenMPDSABlock(OMPD_declare_mapper, DirName, getCurScope(), Loc); + Actions.OpenMP().StartOpenMPDSABlock(OMPD_declare_mapper, DirName, + getCurScope(), Loc); // Add the mapper variable declaration. - ExprResult MapperVarRef = Actions.ActOnOpenMPDeclareMapperDirectiveVarDecl( - getCurScope(), MapperType, Range.getBegin(), VName); + ExprResult MapperVarRef = + Actions.OpenMP().ActOnOpenMPDeclareMapperDirectiveVarDecl( + getCurScope(), MapperType, Range.getBegin(), VName); // Parse map clauses. SmallVector Clauses; @@ -603,7 +609,7 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) { OpenMPClauseKind CKind = Tok.isAnnotation() ? OMPC_unknown : getOpenMPClauseKind(PP.getSpelling(Tok)); - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); OMPClause *Clause = ParseOpenMPClause(OMPD_declare_mapper, CKind, Clauses.empty()); if (Clause) @@ -613,7 +619,7 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) { // Skip ',' if any. if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } if (Clauses.empty()) { Diag(Tok, diag::err_omp_expected_clause) @@ -622,9 +628,9 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) { } // Exit scope. 
- Actions.EndOpenMPDSABlock(nullptr); + Actions.OpenMP().EndOpenMPDSABlock(nullptr); OMPDirectiveScope.Exit(); - DeclGroupPtrTy DG = Actions.ActOnOpenMPDeclareMapperDirective( + DeclGroupPtrTy DG = Actions.OpenMP().ActOnOpenMPDeclareMapperDirective( getCurScope(), Actions.getCurLexicalContext(), MapperId, MapperType, Range.getBegin(), VName, AS, MapperVarRef.get(), Clauses); if (!IsCorrect) @@ -652,7 +658,8 @@ TypeResult Parser::parseOpenMPDeclareMapperVarDecl(SourceRange &Range, } Name = Actions.GetNameForDeclarator(DeclaratorInfo).getName(); - return Actions.ActOnOpenMPDeclareMapperVarDecl(getCurScope(), DeclaratorInfo); + return Actions.OpenMP().ActOnOpenMPDeclareMapperVarDecl(getCurScope(), + DeclaratorInfo); } namespace { @@ -748,7 +755,7 @@ static bool parseDeclareSimdClauses( OpenMPClauseKind CKind = getOpenMPClauseKind(ClauseName); if (CKind == OMPC_uniform || CKind == OMPC_aligned || CKind == OMPC_linear) { - Sema::OpenMPVarListDataTy Data; + SemaOpenMP::OpenMPVarListDataTy Data; SmallVectorImpl *Vars = &Uniforms; if (CKind == OMPC_aligned) { Vars = &Aligneds; @@ -768,7 +775,7 @@ static bool parseDeclareSimdClauses( assert(0 <= Data.ExtraModifier && Data.ExtraModifier <= OMPC_LINEAR_unknown && "Unexpected linear modifier."); - if (P.getActions().CheckOpenMPLinearModifier( + if (P.getActions().OpenMP().CheckOpenMPLinearModifier( static_cast(Data.ExtraModifier), Data.ExtraModifierLoc)) Data.ExtraModifier = OMPC_LINEAR_val; @@ -816,7 +823,7 @@ Parser::ParseOMPDeclareSimdClauses(Parser::DeclGroupPtrTy Ptr, SourceLocation EndLoc = ConsumeAnnotationToken(); if (IsError) return Ptr; - return Actions.ActOnOpenMPDeclareSimdDirective( + return Actions.OpenMP().ActOnOpenMPDeclareSimdDirective( Ptr, BS, Simdlen.get(), Uniforms, Aligneds, Alignments, Linears, LinModifiers, Steps, SourceRange(Loc, EndLoc)); } @@ -1412,7 +1419,8 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, return; } - OMPTraitInfo *ParentTI = Actions.getOMPTraitInfoForSurroundingScope(); + OMPTraitInfo *ParentTI = + Actions.OpenMP().getOMPTraitInfoForSurroundingScope(); ASTContext &ASTCtx = Actions.getASTContext(); OMPTraitInfo &TI = ASTCtx.getNewOMPTraitInfo(); SmallVector AdjustNothing; @@ -1445,7 +1453,7 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, case OMPC_adjust_args: { AdjustArgsLoc = Tok.getLocation(); ConsumeToken(); - Sema::OpenMPVarListDataTy Data; + SemaOpenMP::OpenMPVarListDataTy Data; SmallVector Vars; IsError = ParseOpenMPVarList(OMPD_declare_variant, OMPC_adjust_args, Vars, Data); @@ -1486,12 +1494,12 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, } std::optional> DeclVarData = - Actions.checkOpenMPDeclareVariantFunction( + Actions.OpenMP().checkOpenMPDeclareVariantFunction( Ptr, AssociatedFunction.get(), TI, AppendArgs.size(), SourceRange(Loc, Tok.getLocation())); if (DeclVarData && !TI.Sets.empty()) - Actions.ActOnOpenMPDeclareVariantDirective( + Actions.OpenMP().ActOnOpenMPDeclareVariantDirective( DeclVarData->first, DeclVarData->second, TI, AdjustNothing, AdjustNeedDevicePtr, AppendArgs, AdjustArgsLoc, AppendArgsLoc, SourceRange(Loc, Tok.getLocation())); @@ -1642,7 +1650,7 @@ void Parser::ParseOpenMPClauses(OpenMPDirectiveKind DKind, OpenMPClauseKind CKind = Tok.isAnnotation() ? 
OMPC_unknown : getOpenMPClauseKind(PP.getSpelling(Tok)); - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); OMPClause *Clause = ParseOpenMPClause( DKind, CKind, !FirstClauses[unsigned(CKind)].getInt()); SkipUntil(tok::comma, tok::identifier, tok::annot_pragma_openmp_end, @@ -1651,13 +1659,13 @@ void Parser::ParseOpenMPClauses(OpenMPDirectiveKind DKind, if (Clause != nullptr) Clauses.push_back(Clause); if (Tok.is(tok::annot_pragma_openmp_end)) { - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); break; } // Skip ',' if any. if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } } @@ -1750,12 +1758,13 @@ void Parser::ParseOpenMPAssumesDirective(OpenMPDirectiveKind DKind, Assumptions.push_back(Assumption); } - Actions.ActOnOpenMPAssumesDirective(Loc, DKind, Assumptions, SkippedClauses); + Actions.OpenMP().ActOnOpenMPAssumesDirective(Loc, DKind, Assumptions, + SkippedClauses); } void Parser::ParseOpenMPEndAssumesDirective(SourceLocation Loc) { - if (Actions.isInOpenMPAssumeScope()) - Actions.ActOnOpenMPEndAssumesDirective(); + if (Actions.OpenMP().isInOpenMPAssumeScope()) + Actions.OpenMP().ActOnOpenMPEndAssumesDirective(); else Diag(Loc, diag::err_expected_begin_assumes); } @@ -1811,7 +1820,7 @@ parseOpenMPSimpleClause(Parser &P, OpenMPClauseKind Kind) { } void Parser::ParseOMPDeclareTargetClauses( - Sema::DeclareTargetContextInfo &DTCI) { + SemaOpenMP::DeclareTargetContextInfo &DTCI) { SourceLocation DeviceTypeLoc; bool RequiresToOrLinkOrIndirectClause = false; bool HasToOrLinkOrIndirectClause = false; @@ -1910,11 +1919,11 @@ void Parser::ParseOMPDeclareTargetClauses( if (DTCI.Kind == OMPD_declare_target || HasIdentifier) { auto &&Callback = [this, MT, &DTCI](CXXScopeSpec &SS, DeclarationNameInfo NameInfo) { - NamedDecl *ND = - Actions.lookupOpenMPDeclareTargetName(getCurScope(), SS, NameInfo); + NamedDecl *ND = Actions.OpenMP().lookupOpenMPDeclareTargetName( + getCurScope(), SS, NameInfo); if (!ND) return; - Sema::DeclareTargetContextInfo::MapInfo MI{MT, NameInfo.getLoc()}; + SemaOpenMP::DeclareTargetContextInfo::MapInfo MI{MT, NameInfo.getLoc()}; bool FirstMapping = DTCI.ExplicitlyMapped.try_emplace(ND, MI).second; if (!FirstMapping) Diag(NameInfo.getLoc(), diag::err_omp_declare_target_multiple) @@ -2090,8 +2099,8 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( skipUntilPragmaOpenMPEnd(DKind); // Skip the last annot_pragma_openmp_end. ConsumeAnnotationToken(); - return Actions.ActOnOpenMPThreadprivateDirective(Loc, - Helper.getIdentifiers()); + return Actions.OpenMP().ActOnOpenMPThreadprivateDirective( + Loc, Helper.getIdentifiers()); } break; } @@ -2109,7 +2118,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( OpenMPClauseKind CKind = Tok.isAnnotation() ? OMPC_unknown : getOpenMPClauseKind(PP.getSpelling(Tok)); - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); OMPClause *Clause = ParseOpenMPClause( OMPD_allocate, CKind, !FirstClauses[unsigned(CKind)].getInt()); SkipUntil(tok::comma, tok::identifier, tok::annot_pragma_openmp_end, @@ -2118,20 +2127,20 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( if (Clause != nullptr) Clauses.push_back(Clause); if (Tok.is(tok::annot_pragma_openmp_end)) { - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); break; } // Skip ',' if any. 
if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } skipUntilPragmaOpenMPEnd(DKind); } // Skip the last annot_pragma_openmp_end. ConsumeAnnotationToken(); - return Actions.ActOnOpenMPAllocateDirective(Loc, Helper.getIdentifiers(), - Clauses); + return Actions.OpenMP().ActOnOpenMPAllocateDirective( + Loc, Helper.getIdentifiers(), Clauses); } break; } @@ -2150,7 +2159,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( OpenMPClauseKind CKind = Tok.isAnnotation() ? OMPC_unknown : getOpenMPClauseKind(PP.getSpelling(Tok)); - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); OMPClause *Clause = ParseOpenMPClause( OMPD_requires, CKind, !FirstClauses[unsigned(CKind)].getInt()); SkipUntil(tok::comma, tok::identifier, tok::annot_pragma_openmp_end, @@ -2159,13 +2168,13 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( if (Clause != nullptr) Clauses.push_back(Clause); if (Tok.is(tok::annot_pragma_openmp_end)) { - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); break; } // Skip ',' if any. if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } // Consume final annot_pragma_openmp_end if (Clauses.empty()) { @@ -2175,14 +2184,15 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( return nullptr; } ConsumeAnnotationToken(); - return Actions.ActOnOpenMPRequiresDirective(StartLoc, Clauses); + return Actions.OpenMP().ActOnOpenMPRequiresDirective(StartLoc, Clauses); } case OMPD_error: { SmallVector Clauses; SourceLocation StartLoc = ConsumeToken(); ParseOpenMPClauses(DKind, Clauses, StartLoc); - Actions.ActOnOpenMPErrorDirective(Clauses, StartLoc, SourceLocation(), - /*InExContext = */ false); + Actions.OpenMP().ActOnOpenMPErrorDirective(Clauses, StartLoc, + SourceLocation(), + /*InExContext = */ false); break; } case OMPD_assumes: @@ -2217,7 +2227,8 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( // { #pragma omp end declare variant } // ConsumeToken(); - OMPTraitInfo *ParentTI = Actions.getOMPTraitInfoForSurroundingScope(); + OMPTraitInfo *ParentTI = + Actions.OpenMP().getOMPTraitInfoForSurroundingScope(); ASTContext &ASTCtx = Actions.getASTContext(); OMPTraitInfo &TI = ASTCtx.getNewOMPTraitInfo(); if (parseOMPDeclareVariantMatchClause(Loc, TI, ParentTI)) { @@ -2248,7 +2259,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( /* ConstructTraits */ ArrayRef()); if (isVariantApplicableInContext(VMI, OMPCtx, /* DeviceSetOnly */ true)) { - Actions.ActOnOpenMPBeginDeclareVariant(Loc, TI); + Actions.OpenMP().ActOnOpenMPBeginDeclareVariant(Loc, TI); break; } @@ -2275,8 +2286,8 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( break; } case OMPD_end_declare_variant: { - if (Actions.isInOpenMPDeclareVariantScope()) - Actions.ActOnOpenMPEndDeclareVariant(); + if (Actions.OpenMP().isInOpenMPDeclareVariantScope()) + Actions.OpenMP().ActOnOpenMPEndDeclareVariant(); else Diag(Loc, diag::err_expected_begin_declare_variant); ConsumeToken(); @@ -2331,7 +2342,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( case OMPD_declare_target: { SourceLocation DTLoc = ConsumeAnyToken(); bool HasClauses = Tok.isNot(tok::annot_pragma_openmp_end); - Sema::DeclareTargetContextInfo DTCI(DKind, DTLoc); + SemaOpenMP::DeclareTargetContextInfo DTCI(DKind, DTLoc); if (HasClauses) 
ParseOMPDeclareTargetClauses(DTCI); bool HasImplicitMappings = DKind == OMPD_begin_declare_target || @@ -2342,24 +2353,24 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( ConsumeAnyToken(); if (HasImplicitMappings) { - Actions.ActOnStartOpenMPDeclareTargetContext(DTCI); + Actions.OpenMP().ActOnStartOpenMPDeclareTargetContext(DTCI); return nullptr; } - Actions.ActOnFinishedOpenMPDeclareTargetContext(DTCI); + Actions.OpenMP().ActOnFinishedOpenMPDeclareTargetContext(DTCI); llvm::SmallVector Decls; for (auto &It : DTCI.ExplicitlyMapped) Decls.push_back(It.first); return Actions.BuildDeclaratorGroup(Decls); } case OMPD_end_declare_target: { - if (!Actions.isInOpenMPDeclareTargetContext()) { + if (!Actions.OpenMP().isInOpenMPDeclareTargetContext()) { Diag(Tok, diag::err_omp_unexpected_directive) << 1 << getOpenMPDirectiveName(DKind); break; } - const Sema::DeclareTargetContextInfo &DTCI = - Actions.ActOnOpenMPEndDeclareTargetDirective(); + const SemaOpenMP::DeclareTargetContextInfo &DTCI = + Actions.OpenMP().ActOnOpenMPEndDeclareTargetDirective(); ParseOMPEndDeclareTargetDirective(DTCI.Kind, DKind, DTCI.Loc); return nullptr; } @@ -2683,7 +2694,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( if (!ParseOpenMPSimpleVarList(DKind, Helper, /*AllowScopeSpecifier=*/false)) { skipUntilPragmaOpenMPEnd(DKind); - DeclGroupPtrTy Res = Actions.ActOnOpenMPThreadprivateDirective( + DeclGroupPtrTy Res = Actions.OpenMP().ActOnOpenMPThreadprivateDirective( Loc, Helper.getIdentifiers()); Directive = Actions.ActOnDeclStmt(Res, Loc, Tok.getLocation()); } @@ -2710,7 +2721,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( OpenMPClauseKind CKind = Tok.isAnnotation() ? OMPC_unknown : getOpenMPClauseKind(PP.getSpelling(Tok)); - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); OMPClause *Clause = ParseOpenMPClause( OMPD_allocate, CKind, !FirstClauses[unsigned(CKind)].getInt()); SkipUntil(tok::comma, tok::identifier, tok::annot_pragma_openmp_end, @@ -2719,17 +2730,17 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( if (Clause != nullptr) Clauses.push_back(Clause); if (Tok.is(tok::annot_pragma_openmp_end)) { - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); break; } // Skip ',' if any. if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } skipUntilPragmaOpenMPEnd(DKind); } - DeclGroupPtrTy Res = Actions.ActOnOpenMPAllocateDirective( + DeclGroupPtrTy Res = Actions.OpenMP().ActOnOpenMPAllocateDirective( Loc, Helper.getIdentifiers(), Clauses); Directive = Actions.ActOnDeclStmt(Res, Loc, Tok.getLocation()); } @@ -2875,7 +2886,8 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( if (isOpenMPSimdDirective(DKind)) ScopeFlags |= Scope::OpenMPSimdDirectiveScope; ParseScope OMPDirectiveScope(this, ScopeFlags); - Actions.StartOpenMPDSABlock(DKind, DirName, Actions.getCurScope(), Loc); + Actions.OpenMP().StartOpenMPDSABlock(DKind, DirName, Actions.getCurScope(), + Loc); while (Tok.isNot(tok::annot_pragma_openmp_end)) { // If we are parsing for a directive within a metadirective, the directive @@ -2909,7 +2921,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( } // No more implicit clauses allowed. 
ImplicitClauseAllowed = false; - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); HasImplicitClause = false; OMPClause *Clause = ParseOpenMPClause( DKind, CKind, !FirstClauses[unsigned(CKind)].getInt()); @@ -2922,7 +2934,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( // Skip ',' if any. if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } // End location of the directive. EndLoc = Tok.getLocation(); @@ -2953,7 +2965,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( StmtResult AssociatedStmt; if (HasAssociatedStatement) { // The body is a block scope like in Lambdas and Blocks. - Actions.ActOnOpenMPRegionStart(DKind, getCurScope()); + Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope()); // FIXME: We create a bogus CompoundStmt scope to hold the contents of // the captured region. Code elsewhere assumes that any FunctionScopeInfo // should have at least one compound statement scope within it. @@ -2964,30 +2976,33 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) && getLangOpts().OpenMPIRBuilder) - AssociatedStmt = Actions.ActOnOpenMPLoopnest(AssociatedStmt.get()); + AssociatedStmt = + Actions.OpenMP().ActOnOpenMPLoopnest(AssociatedStmt.get()); } - AssociatedStmt = Actions.ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); + AssociatedStmt = + Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); } else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data || DKind == OMPD_target_exit_data) { - Actions.ActOnOpenMPRegionStart(DKind, getCurScope()); + Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope()); AssociatedStmt = (Sema::CompoundScopeRAII(Actions), Actions.ActOnCompoundStmt(Loc, Loc, std::nullopt, /*isStmtExpr=*/false)); - AssociatedStmt = Actions.ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); + AssociatedStmt = + Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); } - Directive = Actions.ActOnOpenMPExecutableDirective( + Directive = Actions.OpenMP().ActOnOpenMPExecutableDirective( DKind, DirName, CancelRegion, Clauses, AssociatedStmt.get(), Loc, EndLoc); // Exit scope. - Actions.EndOpenMPDSABlock(Directive.get()); + Actions.OpenMP().EndOpenMPDSABlock(Directive.get()); OMPDirectiveScope.Exit(); break; } case OMPD_declare_target: { SourceLocation DTLoc = ConsumeAnyToken(); bool HasClauses = Tok.isNot(tok::annot_pragma_openmp_end); - Sema::DeclareTargetContextInfo DTCI(DKind, DTLoc); + SemaOpenMP::DeclareTargetContextInfo DTCI(DKind, DTLoc); if (HasClauses) ParseOMPDeclareTargetClauses(DTCI); bool HasImplicitMappings = @@ -3003,7 +3018,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( // Skip the last annot_pragma_openmp_end. 
ConsumeAnyToken(); - Actions.ActOnFinishedOpenMPDeclareTargetContext(DTCI); + Actions.OpenMP().ActOnFinishedOpenMPDeclareTargetContext(DTCI); break; } case OMPD_declare_simd: @@ -3118,7 +3133,7 @@ OMPClause *Parser::ParseOpenMPSizesClause() { T.consumeClose(); - return Actions.ActOnOpenMPSizesClause( + return Actions.OpenMP().ActOnOpenMPSizesClause( ValExprs, ClauseNameLoc, T.getOpenLocation(), T.getCloseLocation()); } @@ -3130,7 +3145,7 @@ OMPClause *Parser::ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind) { BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end); if (T.expectAndConsume(diag::err_expected_lparen_after, "uses_allocator")) return nullptr; - SmallVector Data; + SmallVector Data; do { CXXScopeSpec SS; Token Replacement; @@ -3144,7 +3159,7 @@ OMPClause *Parser::ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind) { StopBeforeMatch); break; } - Sema::UsesAllocatorsData &D = Data.emplace_back(); + SemaOpenMP::UsesAllocatorsData &D = Data.emplace_back(); D.Allocator = Allocator.get(); if (Tok.is(tok::l_paren)) { BalancedDelimiterTracker T(*this, tok::l_paren, @@ -3169,8 +3184,8 @@ OMPClause *Parser::ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind) { ConsumeAnyToken(); } while (Tok.isNot(tok::r_paren) && Tok.isNot(tok::annot_pragma_openmp_end)); T.consumeClose(); - return Actions.ActOnOpenMPUsesAllocatorClause(Loc, T.getOpenLocation(), - T.getCloseLocation(), Data); + return Actions.OpenMP().ActOnOpenMPUsesAllocatorClause( + Loc, T.getOpenLocation(), T.getCloseLocation(), Data); } /// Parsing of OpenMP clauses. @@ -3538,15 +3553,16 @@ OMPClause *Parser::ParseOpenMPSingleExprClause(OpenMPClauseKind Kind, if (ParseOnly) return nullptr; - return Actions.ActOnOpenMPSingleExprClause(Kind, Val.get(), Loc, LLoc, RLoc); + return Actions.OpenMP().ActOnOpenMPSingleExprClause(Kind, Val.get(), Loc, + LLoc, RLoc); } /// Parse indirect clause for '#pragma omp declare target' directive. /// 'indirect' '[' '(' invoked-by-fptr ')' ']' /// where invoked-by-fptr is a constant boolean expression that evaluates to /// true or false at compile time. 
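/// E.g. '#pragma omp declare target indirect(true) to(fn)' (illustrative;
/// the expression must fold to a boolean constant).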
-bool Parser::ParseOpenMPIndirectClause(Sema::DeclareTargetContextInfo &DTCI, - bool ParseOnly) { +bool Parser::ParseOpenMPIndirectClause( + SemaOpenMP::DeclareTargetContextInfo &DTCI, bool ParseOnly) { SourceLocation Loc = ConsumeToken(); SourceLocation RLoc; @@ -3721,15 +3737,16 @@ OMPClause *Parser::ParseOpenMPInteropClause(OpenMPClauseKind Kind, return nullptr; if (Kind == OMPC_init) - return Actions.ActOnOpenMPInitClause(InteropVarExpr.get(), InteropInfo, Loc, - T.getOpenLocation(), VarLoc, RLoc); + return Actions.OpenMP().ActOnOpenMPInitClause( + InteropVarExpr.get(), InteropInfo, Loc, T.getOpenLocation(), VarLoc, + RLoc); if (Kind == OMPC_use) - return Actions.ActOnOpenMPUseClause(InteropVarExpr.get(), Loc, - T.getOpenLocation(), VarLoc, RLoc); + return Actions.OpenMP().ActOnOpenMPUseClause( + InteropVarExpr.get(), Loc, T.getOpenLocation(), VarLoc, RLoc); if (Kind == OMPC_destroy) - return Actions.ActOnOpenMPDestroyClause(InteropVarExpr.get(), Loc, - T.getOpenLocation(), VarLoc, RLoc); + return Actions.OpenMP().ActOnOpenMPDestroyClause( + InteropVarExpr.get(), Loc, T.getOpenLocation(), VarLoc, RLoc); llvm_unreachable("Unexpected interop variable clause."); } @@ -3787,8 +3804,8 @@ OMPClause *Parser::ParseOpenMPOMPXAttributesClause(bool ParseOnly) { }; } - return Actions.ActOnOpenMPXAttributeClause(Attrs, Loc, T.getOpenLocation(), - T.getCloseLocation()); + return Actions.OpenMP().ActOnOpenMPXAttributeClause( + Attrs, Loc, T.getOpenLocation(), T.getCloseLocation()); } /// Parsing of simple OpenMP clauses like 'default' or 'proc_bind'. @@ -3823,9 +3840,8 @@ OMPClause *Parser::ParseOpenMPSimpleClause(OpenMPClauseKind Kind, << getOpenMPClauseName(OMPC_default) << "5.1"; return nullptr; } - return Actions.ActOnOpenMPSimpleClause(Kind, Val->Type, - Val->TypeLoc, Val->LOpen, - Val->Loc, Val->RLoc); + return Actions.OpenMP().ActOnOpenMPSimpleClause( + Kind, Val->Type, Val->TypeLoc, Val->LOpen, Val->Loc, Val->RLoc); } /// Parsing of OpenMP clauses like 'ordered'. @@ -3860,7 +3876,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPClauseKind Kind, bool ParseOnly) { if (ParseOnly) return nullptr; - return Actions.ActOnOpenMPClause(Kind, Loc, Tok.getLocation()); + return Actions.OpenMP().ActOnOpenMPClause(Kind, Loc, Tok.getLocation()); } /// Parsing of OpenMP clauses with single expressions and some additional @@ -4118,7 +4134,7 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind, if (ParseOnly) return nullptr; - return Actions.ActOnOpenMPSingleExprWithArgClause( + return Actions.OpenMP().ActOnOpenMPSingleExprWithArgClause( Kind, Arg, Val.get(), Loc, T.getOpenLocation(), KLoc, DelimLoc, RLoc); } @@ -4184,7 +4200,7 @@ static OpenMPMapModifierKind isMapModifier(Parser &P) { } /// Parse the mapper modifier in map, to, and from clauses. -bool Parser::parseMapperModifier(Sema::OpenMPVarListDataTy &Data) { +bool Parser::parseMapperModifier(SemaOpenMP::OpenMPVarListDataTy &Data) { // Parse '('. BalancedDelimiterTracker T(*this, tok::l_paren, tok::colon); if (T.expectAndConsume(diag::err_expected_lparen_after, "mapper")) { @@ -4216,7 +4232,7 @@ bool Parser::parseMapperModifier(Sema::OpenMPVarListDataTy &Data) { /// map([ [map-type-modifier[,] [map-type-modifier[,] ...] 
map-type : ] list) /// where, map-type-modifier ::= always | close | mapper(mapper-identifier) | /// present -bool Parser::parseMapTypeModifiers(Sema::OpenMPVarListDataTy &Data) { +bool Parser::parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data) { while (getCurToken().isNot(tok::colon)) { OpenMPMapModifierKind TypeModifier = isMapModifier(*this); if (TypeModifier == OMPC_MAP_MODIFIER_always || @@ -4282,7 +4298,7 @@ static OpenMPMapClauseKind isMapType(Parser &P) { /// Parse map-type in map clause. /// map([ [map-type-modifier[,] [map-type-modifier[,] ...] map-type : ] list) /// where, map-type ::= to | from | tofrom | alloc | release | delete -static void parseMapType(Parser &P, Sema::OpenMPVarListDataTy &Data) { +static void parseMapType(Parser &P, SemaOpenMP::OpenMPVarListDataTy &Data) { Token Tok = P.getCurToken(); if (Tok.is(tok::colon)) { P.Diag(Tok, diag::err_omp_map_type_missing); @@ -4306,7 +4322,7 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() { return ExprError(); SourceLocation LLoc = T.getOpenLocation(); - SmallVector Data; + SmallVector Data; while (Tok.isNot(tok::r_paren) && Tok.isNot(tok::annot_pragma_openmp_end)) { // Check if the type parsing is required. ParsedType IteratorType; @@ -4380,7 +4396,7 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() { if (Tok.is(tok::comma)) ConsumeToken(); - Sema::OMPIteratorData &D = Data.emplace_back(); + SemaOpenMP::OMPIteratorData &D = Data.emplace_back(); D.DeclIdent = II; D.DeclIdentLoc = IdLoc; D.Type = IteratorType; @@ -4397,12 +4413,12 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() { if (!T.consumeClose()) RLoc = T.getCloseLocation(); - return Actions.ActOnOMPIteratorExpr(getCurScope(), IteratorKwLoc, LLoc, RLoc, - Data); + return Actions.OpenMP().ActOnOMPIteratorExpr(getCurScope(), IteratorKwLoc, + LLoc, RLoc, Data); } bool Parser::ParseOpenMPReservedLocator(OpenMPClauseKind Kind, - Sema::OpenMPVarListDataTy &Data, + SemaOpenMP::OpenMPVarListDataTy &Data, const LangOptions &LangOpts) { // Currently the only reserved locator is 'omp_all_memory' which is only // allowed on a depend clause. @@ -4430,7 +4446,7 @@ bool Parser::ParseOpenMPReservedLocator(OpenMPClauseKind Kind, /// Parse step size expression. Returns true if parsing is successfull, /// otherwise returns false. 
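Alongside the call-site rewrites, nested helper types move with the methods: Sema::UsesAllocatorsData and Sema::OMPIteratorData become SemaOpenMP::UsesAllocatorsData and SemaOpenMP::OMPIteratorData. The local SmallVector declarations in the uses_allocator and iterator hunks lost their template arguments in transit; judging by the adjacent emplace_back lines they read SmallVector<Sema::UsesAllocatorsData, 4> and SmallVector<Sema::OMPIteratorData, 4> before the change and gain the SemaOpenMP:: qualifier after it. A reduced sketch of the same move, with invented names:

    #include <vector>

    // Before: struct Data lived inside Old; after: inside New. Call sites
    // change only the qualifier, not the shape of the code.
    struct Old { struct Data { int Allocator = 0; }; };
    struct New { struct Data { int Allocator = 0; }; };

    int parseList() {
      // was: std::vector<Old::Data> Items;
      std::vector<New::Data> Items;
      New::Data &D = Items.emplace_back(); // same pattern as Data.emplace_back()
      D.Allocator = 42;
      return static_cast<int>(Items.size());
    }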
-static bool parseStepSize(Parser &P, Sema::OpenMPVarListDataTy &Data, +static bool parseStepSize(Parser &P, SemaOpenMP::OpenMPVarListDataTy &Data, OpenMPClauseKind CKind, SourceLocation ELoc) { ExprResult Tail = P.ParseAssignmentExpression(); Sema &Actions = P.getActions(); @@ -4451,7 +4467,7 @@ static bool parseStepSize(Parser &P, Sema::OpenMPVarListDataTy &Data, bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, OpenMPClauseKind Kind, SmallVectorImpl &Vars, - Sema::OpenMPVarListDataTy &Data) { + SemaOpenMP::OpenMPVarListDataTy &Data) { UnqualifiedId UnqualifiedReductionId; bool InvalidReductionId = false; bool IsInvalidMapperModifier = false; @@ -4961,7 +4977,7 @@ OMPClause *Parser::ParseOpenMPVarListClause(OpenMPDirectiveKind DKind, SourceLocation Loc = Tok.getLocation(); SourceLocation LOpen = ConsumeToken(); SmallVector Vars; - Sema::OpenMPVarListDataTy Data; + SemaOpenMP::OpenMPVarListDataTy Data; if (ParseOpenMPVarList(DKind, Kind, Vars, Data)) return nullptr; @@ -4969,5 +4985,5 @@ OMPClause *Parser::ParseOpenMPVarListClause(OpenMPDirectiveKind DKind, if (ParseOnly) return nullptr; OMPVarListLocTy Locs(Loc, LOpen, Data.RLoc); - return Actions.ActOnOpenMPVarListClause(Kind, Vars, Locs, Data); + return Actions.OpenMP().ActOnOpenMPVarListClause(Kind, Vars, Locs, Data); } diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 76a3fa8f2627de..629421c01d17d2 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -22,6 +22,7 @@ #include "clang/Sema/DeclSpec.h" #include "clang/Sema/EnterExpressionEvaluationContext.h" #include "clang/Sema/Scope.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/TypoCorrection.h" #include "llvm/ADT/STLExtras.h" #include @@ -2301,7 +2302,7 @@ StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) { // In OpenMP loop region loop control variable must be captured and be // private. Perform analysis of first part (if any). 
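parseStepSize, whose signature changes just above, follows a common parser convention: a static helper parses one construct, records the result into the caller-owned clause-data struct, and reports success as a bool so the caller can recover. A hedged sketch of that convention, with invented names and a plain string standing in for the token stream:

    #include <charconv>
    #include <optional>
    #include <string_view>
    #include <system_error>

    struct VarListData {              // stand-in for SemaOpenMP::OpenMPVarListDataTy
      std::optional<long> StepSize;
    };

    // Parse Tok as an integer step; stash it in Data and return true on
    // success, false on error (Data is left untouched so the caller recovers).
    static bool parseStepSize(std::string_view Tok, VarListData &Data) {
      long Value = 0;
      auto [Ptr, Ec] = std::from_chars(Tok.data(), Tok.data() + Tok.size(), Value);
      if (Ec != std::errc() || Ptr != Tok.data() + Tok.size())
        return false;
      Data.StepSize = Value;
      return true;
    }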
if (getLangOpts().OpenMP && FirstPart.isUsable()) { - Actions.ActOnOpenMPLoopInitialization(ForLoc, FirstPart.get()); + Actions.OpenMP().ActOnOpenMPLoopInitialization(ForLoc, FirstPart.get()); } }
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 8de202f4f7a0c3..a1e32d391ed0cc 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp
@@ -46,6 +46,7 @@ #include "clang/Sema/SemaHLSL.h" #include "clang/Sema/SemaInternal.h" #include "clang/Sema/SemaOpenACC.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/SemaSYCL.h" #include "clang/Sema/TemplateDeduction.h" #include "clang/Sema/TemplateInstCallback.h"
@@ -203,6 +204,7 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer, CUDAPtr(std::make_unique<SemaCUDA>(*this)), HLSLPtr(std::make_unique<SemaHLSL>(*this)), OpenACCPtr(std::make_unique<SemaOpenACC>(*this)), + OpenMPPtr(std::make_unique<SemaOpenMP>(*this)), SYCLPtr(std::make_unique<SemaSYCL>(*this)), MSPointerToMemberRepresentationMethod( LangOpts.getMSPointerToMemberRepresentationMethod()),
@@ -226,8 +228,7 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer, StringWithUTF8StringMethod(nullptr), ValueWithBytesObjCTypeMethod(nullptr), NSArrayDecl(nullptr), ArrayWithObjectsMethod(nullptr), NSDictionaryDecl(nullptr), - DictionaryWithObjectsMethod(nullptr), CodeCompleter(CodeCompleter), - VarDataSharingAttributesStack(nullptr) { + DictionaryWithObjectsMethod(nullptr), CodeCompleter(CodeCompleter) { assert(pp.TUKind == TUKind); TUScope = nullptr;
@@ -252,7 +253,7 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer, nullptr, ExpressionEvaluationContextRecord::EK_Other); // Initialization of data sharing attributes stack for OpenMP - InitDataSharingAttributesStack(); + OpenMP().InitDataSharingAttributesStack(); std::unique_ptr<sema::SemaPPCallbacks> Callbacks = std::make_unique<sema::SemaPPCallbacks>();
@@ -501,7 +502,7 @@ Sema::~Sema() { threadSafety::threadSafetyCleanup(ThreadSafetyDeclCache); // Destroys data sharing attributes stack for OpenMP - DestroyDataSharingAttributesStack(); + OpenMP().DestroyDataSharingAttributesStack(); // Detach from the PP callback handler which outlives Sema since it's owned // by the preprocessor.
@@ -1159,7 +1160,7 @@ void Sema::ActOnEndOfTranslationUnit() { DiagnoseUnterminatedPragmaAlignPack(); DiagnoseUnterminatedPragmaAttribute(); - DiagnoseUnterminatedOpenMPDeclareTarget(); + OpenMP().DiagnoseUnterminatedOpenMPDeclareTarget(); // All delayed member exception specs should be checked or we end up accepting // incompatible declarations.
@@ -1747,7 +1748,7 @@ class DeferredDiagnosticsEmitter // Finalize analysis of OpenMP-specific constructs. if (Caller && S.LangOpts.OpenMP && UsePath.size() == 1 && (ShouldEmitRootNode || InOMPDeviceContext)) - S.finalizeOpenMPDelayedAnalysis(Caller, FD, Loc); + S.OpenMP().finalizeOpenMPDelayedAnalysis(Caller, FD, Loc); if (Caller) S.CUDA().DeviceKnownEmittedFns[FD] = {Caller, Loc}; // Always emit deferred diagnostics for the direct users. This does not
@@ -1899,8 +1900,8 @@ Sema::targetDiag(SourceLocation Loc, unsigned DiagID, const FunctionDecl *FD) { FD = FD ? FD : getCurFunctionDecl(); if (LangOpts.OpenMP) return LangOpts.OpenMPIsTargetDevice - ? diagIfOpenMPDeviceCode(Loc, DiagID, FD) - : diagIfOpenMPHostCode(Loc, DiagID, FD); + ? OpenMP().diagIfOpenMPDeviceCode(Loc, DiagID, FD) + : OpenMP().diagIfOpenMPHostCode(Loc, DiagID, FD); if (getLangOpts().CUDA) return getLangOpts().CUDAIsDevice ?
CUDA().DiagIfDeviceCode(Loc, DiagID) : CUDA().DiagIfHostCode(Loc, DiagID); @@ -2131,7 +2132,7 @@ void Sema::PushFunctionScope() { FunctionScopes.push_back(new FunctionScopeInfo(getDiagnostics())); } if (LangOpts.OpenMP) - pushOpenMPFunctionRegion(); + OpenMP().pushOpenMPFunctionRegion(); } void Sema::PushBlockScope(Scope *BlockScope, BlockDecl *Block) { @@ -2251,7 +2252,7 @@ Sema::PopFunctionScopeInfo(const AnalysisBasedWarnings::Policy *WP, PoppedFunctionScopeDeleter(this)); if (LangOpts.OpenMP) - popOpenMPFunctionRegion(Scope.get()); + OpenMP().popOpenMPFunctionRegion(Scope.get()); // Issue any analysis-based warnings. if (WP && D) @@ -2687,7 +2688,9 @@ void Sema::PushCapturedRegionScope(Scope *S, CapturedDecl *CD, RecordDecl *RD, unsigned OpenMPCaptureLevel) { auto *CSI = new CapturedRegionScopeInfo( getDiagnostics(), S, CD, RD, CD->getContextParam(), K, - (getLangOpts().OpenMP && K == CR_OpenMP) ? getOpenMPNestingLevel() : 0, + (getLangOpts().OpenMP && K == CR_OpenMP) + ? OpenMP().getOpenMPNestingLevel() + : 0, OpenMPCaptureLevel); CSI->ReturnType = Context.VoidTy; FunctionScopes.push_back(CSI); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 8b3b9d020db572..390da508518e16 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -48,6 +48,7 @@ #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaHLSL.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/Template.h" #include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/SmallString.h" @@ -6168,11 +6169,12 @@ Decl *Sema::ActOnDeclarator(Scope *S, Declarator &D) { // Check if we are in an `omp begin/end declare variant` scope. Handle this // declaration only if the `bind_to_declaration` extension is set. SmallVector Bases; - if (LangOpts.OpenMP && isInOpenMPDeclareVariantScope()) - if (getOMPTraitInfoForSurroundingScope()->isExtensionActive(llvm::omp::TraitProperty:: - implementation_extension_bind_to_declaration)) - ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( - S, D, MultiTemplateParamsArg(), Bases); + if (LangOpts.OpenMP && OpenMP().isInOpenMPDeclareVariantScope()) + if (OpenMP().getOMPTraitInfoForSurroundingScope()->isExtensionActive( + llvm::omp::TraitProperty:: + implementation_extension_bind_to_declaration)) + OpenMP().ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( + S, D, MultiTemplateParamsArg(), Bases); Decl *Dcl = HandleDeclarator(S, D, MultiTemplateParamsArg()); @@ -6181,7 +6183,8 @@ Decl *Sema::ActOnDeclarator(Scope *S, Declarator &D) { Dcl->setTopLevelDeclInObjCContainer(); if (!Bases.empty()) - ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(Dcl, Bases); + OpenMP().ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(Dcl, + Bases); return Dcl; } @@ -6568,8 +6571,8 @@ NamedDecl *Sema::HandleDeclarator(Scope *S, Declarator &D, if (New->getDeclName() && AddToScope) PushOnScopeChains(New, S); - if (isInOpenMPDeclareTargetContext()) - checkDeclIsAllowedInOpenMPTarget(nullptr, New); + if (OpenMP().isInOpenMPDeclareTargetContext()) + OpenMP().checkDeclIsAllowedInOpenMPTarget(nullptr, New); return New; } @@ -12268,7 +12271,7 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, } if (LangOpts.OpenMP) - ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(NewFD); + OpenMP().ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(NewFD); // Semantic checking for this function declaration (in isolation). 
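The PushFunctionScope/PopFunctionScopeInfo hunks above show the invariant these delegations must preserve: the OpenMP function-region stack is pushed and popped under the same LangOpts.OpenMP gate, so it stays in lockstep with Sema's function scopes. A small sketch of that discipline, with invented names:

    #include <cassert>
    #include <vector>

    class RegionStack {
      std::vector<int> Regions;
    public:
      void push() { Regions.push_back(0); }
      void pop() {
        assert(!Regions.empty() && "pop without matching push");
        Regions.pop_back();
      }
      bool balanced() const { return Regions.empty(); }
    };

    // Caller-side discipline mirroring PushFunctionScope/PopFunctionScopeInfo:
    // push and pop are gated on the same condition, so the stack can never be
    // popped when the feature is off.
    void analyzeFunction(bool OpenMPEnabled, RegionStack &RS) {
      if (OpenMPEnabled)
        RS.push();
      // ... per-function analysis ...
      if (OpenMPEnabled)
        RS.pop();
    }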
@@ -14956,7 +14959,7 @@ Sema::DeclGroupPtrTy Sema::FinalizeDeclaratorGroup(Scope *S, const DeclSpec &DS, if (auto *VD = dyn_cast<VarDecl>(D); LangOpts.OpenMP && VD && VD->hasAttr<OMPDeclareTargetDeclAttr>() && VD->hasGlobalStorage()) - ActOnOpenMPDeclareTargetInitializer(D); + OpenMP().ActOnOpenMPDeclareTargetInitializer(D); // For declarators, there are some additional syntactic-ish checks we need // to perform. if (auto *DD = dyn_cast<DeclaratorDecl>(D)) {
@@ -15495,8 +15498,8 @@ Sema::ActOnStartOfFunctionDef(Scope *FnBodyScope, Declarator &D, // specialization function under the OpenMP context defined as part of the // `omp begin declare variant`. SmallVector<FunctionDecl *, 4> Bases; - if (LangOpts.OpenMP && isInOpenMPDeclareVariantScope()) - ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( + if (LangOpts.OpenMP && OpenMP().isInOpenMPDeclareVariantScope()) + OpenMP().ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( ParentScope, D, TemplateParameterLists, Bases); D.setFunctionDefinitionKind(FunctionDefinitionKind::Definition);
@@ -15504,7 +15507,8 @@ Sema::ActOnStartOfFunctionDef(Scope *FnBodyScope, Declarator &D, Decl *Dcl = ActOnStartOfFunctionDef(FnBodyScope, DP, SkipBody, BodyKind); if (!Bases.empty()) - ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(Dcl, Bases); + OpenMP().ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(Dcl, + Bases); return Dcl; }
@@ -20651,7 +20655,7 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(const FunctionDecl *FD, return FunctionEmissionStatus::OMPDiscarded; // If we have an explicit value for the device type, or we are in a target // declare context, we need to emit all extern and used symbols. - if (isInOpenMPDeclareTargetContext() || DevTy) + if (OpenMP().isInOpenMPDeclareTargetContext() || DevTy) if (IsEmittedForExternalSymbol()) return FunctionEmissionStatus::Emitted; // Device mode only emits what it must, if it wasn't tagged yet and needed,
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 7669171fea56ff..8c6bae545bfd15 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -44,6 +44,7 @@ #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/Template.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h"
@@ -962,8 +963,8 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D, CurContext->addHiddenDecl(New); } - if (isInOpenMPDeclareTargetContext()) - checkDeclIsAllowedInOpenMPTarget(nullptr, New); + if (OpenMP().isInOpenMPDeclareTargetContext()) + OpenMP().checkDeclIsAllowedInOpenMPTarget(nullptr, New); return New; }
@@ -18654,8 +18655,8 @@ void Sema::MarkVTableUsed(SourceLocation Loc, CXXRecordDecl *Class, // Do not mark as used if compiling for the device outside of the target // region.
if (TUKind != TU_Prefix && LangOpts.OpenMP && LangOpts.OpenMPIsTargetDevice && - !isInOpenMPDeclareTargetContext() && - !isInOpenMPTargetExecutionDirective()) { + !OpenMP().isInOpenMPDeclareTargetContext() && + !OpenMP().isInOpenMPTargetExecutionDirective()) { if (!DefinitionRequired) MarkVirtualMembersReferenced(Loc, Class); return; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 189764cb4b6b08..cabffa47c93185 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -52,6 +52,7 @@ #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaFixItUtils.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/Template.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/STLForwardCompat.h" @@ -360,9 +361,9 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, // at the same location. // [OpenMP 5.2] Also allow iterator declared variables. if (LangOpts.OpenMP && isa(D) && - !isOpenMPDeclareMapperVarDeclAllowed(cast(D))) { + !OpenMP().isOpenMPDeclareMapperVarDeclAllowed(cast(D))) { Diag(Loc, diag::err_omp_declare_mapper_wrong_var) - << getOpenMPDeclareMapperVarName(); + << OpenMP().getOpenMPDeclareMapperVarName(); Diag(D->getLocation(), diag::note_entity_declared_at) << D; return true; } @@ -2267,7 +2268,7 @@ NonOdrUseReason Sema::getNonOdrUseReasonInCurrentContext(ValueDecl *D) { // be loaded from the captured. if (VarDecl *VD = dyn_cast(D)) { if (VD->getType()->isReferenceType() && - !(getLangOpts().OpenMP && isOpenMPCapturedDecl(D)) && + !(getLangOpts().OpenMP && OpenMP().isOpenMPCapturedDecl(D)) && !isCapturingReferenceToHostVarInCUDADeviceLambda(*this, VD) && VD->isUsableInConstantExpressions(Context)) return NOUR_Constant; @@ -5080,9 +5081,10 @@ ExprResult Sema::ActOnArraySubscriptExpr(Scope *S, Expr *base, if (base && !base->getType().isNull() && base->hasPlaceholderType(BuiltinType::OMPArraySection)) - return ActOnOMPArraySectionExpr(base, lbLoc, ArgExprs.front(), SourceLocation(), - SourceLocation(), /*Length*/ nullptr, - /*Stride=*/nullptr, rbLoc); + return OpenMP().ActOnOMPArraySectionExpr(base, lbLoc, ArgExprs.front(), + SourceLocation(), SourceLocation(), + /*Length*/ nullptr, + /*Stride=*/nullptr, rbLoc); // Since this might be a postfix expression, get rid of ParenListExprs. 
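The ActOnArraySubscriptExpr hunk above illustrates how expression routing now works: when the base expression carries the OMPArraySection placeholder type, the whole bracketed form is handed to the OpenMP component instead of being typed as an ordinary subscript. Roughly, under assumed names:

    #include <iostream>

    enum class PlaceholderKind { None, OMPArraySection };

    struct Expr { PlaceholderKind Placeholder = PlaceholderKind::None; };

    Expr *buildOMPArraySection(Expr *Base) {
      std::cout << "routed to OpenMP section handling\n";
      return Base;
    }

    Expr *buildArraySubscript(Expr *Base) {
      std::cout << "ordinary subscript\n";
      return Base;
    }

    // Mirrors the shape of ActOnArraySubscriptExpr: the placeholder type on
    // the base decides which semantic handler owns the expression.
    Expr *actOnSubscript(Expr *Base) {
      if (Base->Placeholder == PlaceholderKind::OMPArraySection)
        return buildOMPArraySection(Base);
      return buildArraySubscript(Base);
    }

    int main() {
      Expr Section{PlaceholderKind::OMPArraySection};
      actOnSubscript(&Section);
    }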
if (isa(base)) { @@ -5354,558 +5356,6 @@ void Sema::CheckSubscriptAccessOfNoDeref(const ArraySubscriptExpr *E) { } } -ExprResult Sema::ActOnOMPArraySectionExpr(Expr *Base, SourceLocation LBLoc, - Expr *LowerBound, - SourceLocation ColonLocFirst, - SourceLocation ColonLocSecond, - Expr *Length, Expr *Stride, - SourceLocation RBLoc) { - if (Base->hasPlaceholderType() && - !Base->hasPlaceholderType(BuiltinType::OMPArraySection)) { - ExprResult Result = CheckPlaceholderExpr(Base); - if (Result.isInvalid()) - return ExprError(); - Base = Result.get(); - } - if (LowerBound && LowerBound->getType()->isNonOverloadPlaceholderType()) { - ExprResult Result = CheckPlaceholderExpr(LowerBound); - if (Result.isInvalid()) - return ExprError(); - Result = DefaultLvalueConversion(Result.get()); - if (Result.isInvalid()) - return ExprError(); - LowerBound = Result.get(); - } - if (Length && Length->getType()->isNonOverloadPlaceholderType()) { - ExprResult Result = CheckPlaceholderExpr(Length); - if (Result.isInvalid()) - return ExprError(); - Result = DefaultLvalueConversion(Result.get()); - if (Result.isInvalid()) - return ExprError(); - Length = Result.get(); - } - if (Stride && Stride->getType()->isNonOverloadPlaceholderType()) { - ExprResult Result = CheckPlaceholderExpr(Stride); - if (Result.isInvalid()) - return ExprError(); - Result = DefaultLvalueConversion(Result.get()); - if (Result.isInvalid()) - return ExprError(); - Stride = Result.get(); - } - - // Build an unanalyzed expression if either operand is type-dependent. - if (Base->isTypeDependent() || - (LowerBound && - (LowerBound->isTypeDependent() || LowerBound->isValueDependent())) || - (Length && (Length->isTypeDependent() || Length->isValueDependent())) || - (Stride && (Stride->isTypeDependent() || Stride->isValueDependent()))) { - return new (Context) OMPArraySectionExpr( - Base, LowerBound, Length, Stride, Context.DependentTy, VK_LValue, - OK_Ordinary, ColonLocFirst, ColonLocSecond, RBLoc); - } - - // Perform default conversions. 
- QualType OriginalTy = OMPArraySectionExpr::getBaseOriginalType(Base); - QualType ResultTy; - if (OriginalTy->isAnyPointerType()) { - ResultTy = OriginalTy->getPointeeType(); - } else if (OriginalTy->isArrayType()) { - ResultTy = OriginalTy->getAsArrayTypeUnsafe()->getElementType(); - } else { - return ExprError( - Diag(Base->getExprLoc(), diag::err_omp_typecheck_section_value) - << Base->getSourceRange()); - } - // C99 6.5.2.1p1 - if (LowerBound) { - auto Res = PerformOpenMPImplicitIntegerConversion(LowerBound->getExprLoc(), - LowerBound); - if (Res.isInvalid()) - return ExprError(Diag(LowerBound->getExprLoc(), - diag::err_omp_typecheck_section_not_integer) - << 0 << LowerBound->getSourceRange()); - LowerBound = Res.get(); - - if (LowerBound->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || - LowerBound->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) - Diag(LowerBound->getExprLoc(), diag::warn_omp_section_is_char) - << 0 << LowerBound->getSourceRange(); - } - if (Length) { - auto Res = - PerformOpenMPImplicitIntegerConversion(Length->getExprLoc(), Length); - if (Res.isInvalid()) - return ExprError(Diag(Length->getExprLoc(), - diag::err_omp_typecheck_section_not_integer) - << 1 << Length->getSourceRange()); - Length = Res.get(); - - if (Length->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || - Length->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) - Diag(Length->getExprLoc(), diag::warn_omp_section_is_char) - << 1 << Length->getSourceRange(); - } - if (Stride) { - ExprResult Res = - PerformOpenMPImplicitIntegerConversion(Stride->getExprLoc(), Stride); - if (Res.isInvalid()) - return ExprError(Diag(Stride->getExprLoc(), - diag::err_omp_typecheck_section_not_integer) - << 1 << Stride->getSourceRange()); - Stride = Res.get(); - - if (Stride->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || - Stride->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) - Diag(Stride->getExprLoc(), diag::warn_omp_section_is_char) - << 1 << Stride->getSourceRange(); - } - - // C99 6.5.2.1p1: "shall have type "pointer to *object* type". Similarly, - // C++ [expr.sub]p1: The type "T" shall be a completely-defined object - // type. Note that functions are not objects, and that (in C99 parlance) - // incomplete types are not object types. - if (ResultTy->isFunctionType()) { - Diag(Base->getExprLoc(), diag::err_omp_section_function_type) - << ResultTy << Base->getSourceRange(); - return ExprError(); - } - - if (RequireCompleteType(Base->getExprLoc(), ResultTy, - diag::err_omp_section_incomplete_type, Base)) - return ExprError(); - - if (LowerBound && !OriginalTy->isAnyPointerType()) { - Expr::EvalResult Result; - if (LowerBound->EvaluateAsInt(Result, Context)) { - // OpenMP 5.0, [2.1.5 Array Sections] - // The array section must be a subset of the original array. - llvm::APSInt LowerBoundValue = Result.Val.getInt(); - if (LowerBoundValue.isNegative()) { - Diag(LowerBound->getExprLoc(), diag::err_omp_section_not_subset_of_array) - << LowerBound->getSourceRange(); - return ExprError(); - } - } - } - - if (Length) { - Expr::EvalResult Result; - if (Length->EvaluateAsInt(Result, Context)) { - // OpenMP 5.0, [2.1.5 Array Sections] - // The length must evaluate to non-negative integers. 
- llvm::APSInt LengthValue = Result.Val.getInt(); - if (LengthValue.isNegative()) { - Diag(Length->getExprLoc(), diag::err_omp_section_length_negative) - << toString(LengthValue, /*Radix=*/10, /*Signed=*/true) - << Length->getSourceRange(); - return ExprError(); - } - } - } else if (ColonLocFirst.isValid() && - (OriginalTy.isNull() || (!OriginalTy->isConstantArrayType() && - !OriginalTy->isVariableArrayType()))) { - // OpenMP 5.0, [2.1.5 Array Sections] - // When the size of the array dimension is not known, the length must be - // specified explicitly. - Diag(ColonLocFirst, diag::err_omp_section_length_undefined) - << (!OriginalTy.isNull() && OriginalTy->isArrayType()); - return ExprError(); - } - - if (Stride) { - Expr::EvalResult Result; - if (Stride->EvaluateAsInt(Result, Context)) { - // OpenMP 5.0, [2.1.5 Array Sections] - // The stride must evaluate to a positive integer. - llvm::APSInt StrideValue = Result.Val.getInt(); - if (!StrideValue.isStrictlyPositive()) { - Diag(Stride->getExprLoc(), diag::err_omp_section_stride_non_positive) - << toString(StrideValue, /*Radix=*/10, /*Signed=*/true) - << Stride->getSourceRange(); - return ExprError(); - } - } - } - - if (!Base->hasPlaceholderType(BuiltinType::OMPArraySection)) { - ExprResult Result = DefaultFunctionArrayLvalueConversion(Base); - if (Result.isInvalid()) - return ExprError(); - Base = Result.get(); - } - return new (Context) OMPArraySectionExpr( - Base, LowerBound, Length, Stride, Context.OMPArraySectionTy, VK_LValue, - OK_Ordinary, ColonLocFirst, ColonLocSecond, RBLoc); -} - -ExprResult Sema::ActOnOMPArrayShapingExpr(Expr *Base, SourceLocation LParenLoc, - SourceLocation RParenLoc, - ArrayRef Dims, - ArrayRef Brackets) { - if (Base->hasPlaceholderType()) { - ExprResult Result = CheckPlaceholderExpr(Base); - if (Result.isInvalid()) - return ExprError(); - Result = DefaultLvalueConversion(Result.get()); - if (Result.isInvalid()) - return ExprError(); - Base = Result.get(); - } - QualType BaseTy = Base->getType(); - // Delay analysis of the types/expressions if instantiation/specialization is - // required. - if (!BaseTy->isPointerType() && Base->isTypeDependent()) - return OMPArrayShapingExpr::Create(Context, Context.DependentTy, Base, - LParenLoc, RParenLoc, Dims, Brackets); - if (!BaseTy->isPointerType() || - (!Base->isTypeDependent() && - BaseTy->getPointeeType()->isIncompleteType())) - return ExprError(Diag(Base->getExprLoc(), - diag::err_omp_non_pointer_type_array_shaping_base) - << Base->getSourceRange()); - - SmallVector NewDims; - bool ErrorFound = false; - for (Expr *Dim : Dims) { - if (Dim->hasPlaceholderType()) { - ExprResult Result = CheckPlaceholderExpr(Dim); - if (Result.isInvalid()) { - ErrorFound = true; - continue; - } - Result = DefaultLvalueConversion(Result.get()); - if (Result.isInvalid()) { - ErrorFound = true; - continue; - } - Dim = Result.get(); - } - if (!Dim->isTypeDependent()) { - ExprResult Result = - PerformOpenMPImplicitIntegerConversion(Dim->getExprLoc(), Dim); - if (Result.isInvalid()) { - ErrorFound = true; - Diag(Dim->getExprLoc(), diag::err_omp_typecheck_shaping_not_integer) - << Dim->getSourceRange(); - continue; - } - Dim = Result.get(); - Expr::EvalResult EvResult; - if (!Dim->isValueDependent() && Dim->EvaluateAsInt(EvResult, Context)) { - // OpenMP 5.0, [2.1.4 Array Shaping] - // Each si is an integral type expression that must evaluate to a - // positive integer. 
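The ActOnOMPArraySectionExpr body being deleted above (it moves to SemaOpenMP.cpp) enforces the OpenMP 5.0 [2.1.5 Array Sections] constraints on constant section arguments: a constant lower bound must not be negative, a constant length must be non-negative, and a constant stride must be strictly positive. A self-contained restatement of just those checks (not the Sema code, which works on Expr nodes and emits diagnostics):

    #include <optional>

    // Returns true when constant section arguments satisfy OpenMP 5.0, 2.1.5.
    // std::nullopt models an argument that is not a compile-time constant and
    // is therefore not checkable here.
    bool sectionArgsValid(std::optional<long> LowerBound,
                          std::optional<long> Length,
                          std::optional<long> Stride) {
      if (LowerBound && *LowerBound < 0)
        return false; // the section must be a subset of the original array
      if (Length && *Length < 0)
        return false; // the length must evaluate to a non-negative integer
      if (Stride && *Stride <= 0)
        return false; // the stride must evaluate to a positive integer
      return true;
    }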
- llvm::APSInt Value = EvResult.Val.getInt(); - if (!Value.isStrictlyPositive()) { - Diag(Dim->getExprLoc(), diag::err_omp_shaping_dimension_not_positive) - << toString(Value, /*Radix=*/10, /*Signed=*/true) - << Dim->getSourceRange(); - ErrorFound = true; - continue; - } - } - } - NewDims.push_back(Dim); - } - if (ErrorFound) - return ExprError(); - return OMPArrayShapingExpr::Create(Context, Context.OMPArrayShapingTy, Base, - LParenLoc, RParenLoc, NewDims, Brackets); -} - -ExprResult Sema::ActOnOMPIteratorExpr(Scope *S, SourceLocation IteratorKwLoc, - SourceLocation LLoc, SourceLocation RLoc, - ArrayRef Data) { - SmallVector ID; - bool IsCorrect = true; - for (const OMPIteratorData &D : Data) { - TypeSourceInfo *TInfo = nullptr; - SourceLocation StartLoc; - QualType DeclTy; - if (!D.Type.getAsOpaquePtr()) { - // OpenMP 5.0, 2.1.6 Iterators - // In an iterator-specifier, if the iterator-type is not specified then - // the type of that iterator is of int type. - DeclTy = Context.IntTy; - StartLoc = D.DeclIdentLoc; - } else { - DeclTy = GetTypeFromParser(D.Type, &TInfo); - StartLoc = TInfo->getTypeLoc().getBeginLoc(); - } - - bool IsDeclTyDependent = DeclTy->isDependentType() || - DeclTy->containsUnexpandedParameterPack() || - DeclTy->isInstantiationDependentType(); - if (!IsDeclTyDependent) { - if (!DeclTy->isIntegralType(Context) && !DeclTy->isAnyPointerType()) { - // OpenMP 5.0, 2.1.6 Iterators, Restrictions, C/C++ - // The iterator-type must be an integral or pointer type. - Diag(StartLoc, diag::err_omp_iterator_not_integral_or_pointer) - << DeclTy; - IsCorrect = false; - continue; - } - if (DeclTy.isConstant(Context)) { - // OpenMP 5.0, 2.1.6 Iterators, Restrictions, C/C++ - // The iterator-type must not be const qualified. - Diag(StartLoc, diag::err_omp_iterator_not_integral_or_pointer) - << DeclTy; - IsCorrect = false; - continue; - } - } - - // Iterator declaration. - assert(D.DeclIdent && "Identifier expected."); - // Always try to create iterator declarator to avoid extra error messages - // about unknown declarations use. - auto *VD = VarDecl::Create(Context, CurContext, StartLoc, D.DeclIdentLoc, - D.DeclIdent, DeclTy, TInfo, SC_None); - VD->setImplicit(); - if (S) { - // Check for conflicting previous declaration. - DeclarationNameInfo NameInfo(VD->getDeclName(), D.DeclIdentLoc); - LookupResult Previous(*this, NameInfo, LookupOrdinaryName, - ForVisibleRedeclaration); - Previous.suppressDiagnostics(); - LookupName(Previous, S); - - FilterLookupForScope(Previous, CurContext, S, /*ConsiderLinkage=*/false, - /*AllowInlineNamespace=*/false); - if (!Previous.empty()) { - NamedDecl *Old = Previous.getRepresentativeDecl(); - Diag(D.DeclIdentLoc, diag::err_redefinition) << VD->getDeclName(); - Diag(Old->getLocation(), diag::note_previous_definition); - } else { - PushOnScopeChains(VD, S); - } - } else { - CurContext->addDecl(VD); - } - - /// Act on the iterator variable declaration. 
- ActOnOpenMPIteratorVarDecl(VD); - - Expr *Begin = D.Range.Begin; - if (!IsDeclTyDependent && Begin && !Begin->isTypeDependent()) { - ExprResult BeginRes = - PerformImplicitConversion(Begin, DeclTy, AA_Converting); - Begin = BeginRes.get(); - } - Expr *End = D.Range.End; - if (!IsDeclTyDependent && End && !End->isTypeDependent()) { - ExprResult EndRes = PerformImplicitConversion(End, DeclTy, AA_Converting); - End = EndRes.get(); - } - Expr *Step = D.Range.Step; - if (!IsDeclTyDependent && Step && !Step->isTypeDependent()) { - if (!Step->getType()->isIntegralType(Context)) { - Diag(Step->getExprLoc(), diag::err_omp_iterator_step_not_integral) - << Step << Step->getSourceRange(); - IsCorrect = false; - continue; - } - std::optional Result = - Step->getIntegerConstantExpr(Context); - // OpenMP 5.0, 2.1.6 Iterators, Restrictions - // If the step expression of a range-specification equals zero, the - // behavior is unspecified. - if (Result && Result->isZero()) { - Diag(Step->getExprLoc(), diag::err_omp_iterator_step_constant_zero) - << Step << Step->getSourceRange(); - IsCorrect = false; - continue; - } - } - if (!Begin || !End || !IsCorrect) { - IsCorrect = false; - continue; - } - OMPIteratorExpr::IteratorDefinition &IDElem = ID.emplace_back(); - IDElem.IteratorDecl = VD; - IDElem.AssignmentLoc = D.AssignLoc; - IDElem.Range.Begin = Begin; - IDElem.Range.End = End; - IDElem.Range.Step = Step; - IDElem.ColonLoc = D.ColonLoc; - IDElem.SecondColonLoc = D.SecColonLoc; - } - if (!IsCorrect) { - // Invalidate all created iterator declarations if error is found. - for (const OMPIteratorExpr::IteratorDefinition &D : ID) { - if (Decl *ID = D.IteratorDecl) - ID->setInvalidDecl(); - } - return ExprError(); - } - SmallVector Helpers; - if (!CurContext->isDependentContext()) { - // Build number of ityeration for each iteration range. - // Ni = ((Stepi > 0) ? 
((Endi + Stepi -1 - Begini)/Stepi) : - // ((Begini-Stepi-1-Endi) / -Stepi); - for (OMPIteratorExpr::IteratorDefinition &D : ID) { - // (Endi - Begini) - ExprResult Res = CreateBuiltinBinOp(D.AssignmentLoc, BO_Sub, D.Range.End, - D.Range.Begin); - if(!Res.isUsable()) { - IsCorrect = false; - continue; - } - ExprResult St, St1; - if (D.Range.Step) { - St = D.Range.Step; - // (Endi - Begini) + Stepi - Res = CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, Res.get(), St.get()); - if (!Res.isUsable()) { - IsCorrect = false; - continue; - } - // (Endi - Begini) + Stepi - 1 - Res = - CreateBuiltinBinOp(D.AssignmentLoc, BO_Sub, Res.get(), - ActOnIntegerConstant(D.AssignmentLoc, 1).get()); - if (!Res.isUsable()) { - IsCorrect = false; - continue; - } - // ((Endi - Begini) + Stepi - 1) / Stepi - Res = CreateBuiltinBinOp(D.AssignmentLoc, BO_Div, Res.get(), St.get()); - if (!Res.isUsable()) { - IsCorrect = false; - continue; - } - St1 = CreateBuiltinUnaryOp(D.AssignmentLoc, UO_Minus, D.Range.Step); - // (Begini - Endi) - ExprResult Res1 = CreateBuiltinBinOp(D.AssignmentLoc, BO_Sub, - D.Range.Begin, D.Range.End); - if (!Res1.isUsable()) { - IsCorrect = false; - continue; - } - // (Begini - Endi) - Stepi - Res1 = - CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, Res1.get(), St1.get()); - if (!Res1.isUsable()) { - IsCorrect = false; - continue; - } - // (Begini - Endi) - Stepi - 1 - Res1 = - CreateBuiltinBinOp(D.AssignmentLoc, BO_Sub, Res1.get(), - ActOnIntegerConstant(D.AssignmentLoc, 1).get()); - if (!Res1.isUsable()) { - IsCorrect = false; - continue; - } - // ((Begini - Endi) - Stepi - 1) / (-Stepi) - Res1 = - CreateBuiltinBinOp(D.AssignmentLoc, BO_Div, Res1.get(), St1.get()); - if (!Res1.isUsable()) { - IsCorrect = false; - continue; - } - // Stepi > 0. - ExprResult CmpRes = - CreateBuiltinBinOp(D.AssignmentLoc, BO_GT, D.Range.Step, - ActOnIntegerConstant(D.AssignmentLoc, 0).get()); - if (!CmpRes.isUsable()) { - IsCorrect = false; - continue; - } - Res = ActOnConditionalOp(D.AssignmentLoc, D.AssignmentLoc, CmpRes.get(), - Res.get(), Res1.get()); - if (!Res.isUsable()) { - IsCorrect = false; - continue; - } - } - Res = ActOnFinishFullExpr(Res.get(), /*DiscardedValue=*/false); - if (!Res.isUsable()) { - IsCorrect = false; - continue; - } - - // Build counter update. - // Build counter. - auto *CounterVD = - VarDecl::Create(Context, CurContext, D.IteratorDecl->getBeginLoc(), - D.IteratorDecl->getBeginLoc(), nullptr, - Res.get()->getType(), nullptr, SC_None); - CounterVD->setImplicit(); - ExprResult RefRes = - BuildDeclRefExpr(CounterVD, CounterVD->getType(), VK_LValue, - D.IteratorDecl->getBeginLoc()); - // Build counter update. 
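The expression tree assembled above computes the per-range trip count spelled out in the Ni comment: Ni = (Step > 0) ? (End + Step - 1 - Begin) / Step : (Begin - Step - 1 - End) / -Step; the counter update built from it continues below. The same arithmetic in plain integers:

    // Trip count of the range {Begin, End, Step}, matching the expression
    // tree built via CreateBuiltinBinOp above; assumes Step != 0 (a constant
    // zero step is rejected earlier with err_omp_iterator_step_constant_zero).
    long iteratorTripCount(long Begin, long End, long Step) {
      if (Step > 0)
        return (End + Step - 1 - Begin) / Step; // round up, ascending range
      return (Begin - Step - 1 - End) / -Step;  // round up, descending range
    }
    // e.g. {0, 10, 3} yields 4 (iterations 0, 3, 6, 9),
    //      {10, 0, -3} also yields 4 (iterations 10, 7, 4, 1).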
- // I = Begini + counter * Stepi; - ExprResult UpdateRes; - if (D.Range.Step) { - UpdateRes = CreateBuiltinBinOp( - D.AssignmentLoc, BO_Mul, - DefaultLvalueConversion(RefRes.get()).get(), St.get()); - } else { - UpdateRes = DefaultLvalueConversion(RefRes.get()); - } - if (!UpdateRes.isUsable()) { - IsCorrect = false; - continue; - } - UpdateRes = CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, D.Range.Begin, - UpdateRes.get()); - if (!UpdateRes.isUsable()) { - IsCorrect = false; - continue; - } - ExprResult VDRes = - BuildDeclRefExpr(cast(D.IteratorDecl), - cast(D.IteratorDecl)->getType(), VK_LValue, - D.IteratorDecl->getBeginLoc()); - UpdateRes = CreateBuiltinBinOp(D.AssignmentLoc, BO_Assign, VDRes.get(), - UpdateRes.get()); - if (!UpdateRes.isUsable()) { - IsCorrect = false; - continue; - } - UpdateRes = - ActOnFinishFullExpr(UpdateRes.get(), /*DiscardedValue=*/true); - if (!UpdateRes.isUsable()) { - IsCorrect = false; - continue; - } - ExprResult CounterUpdateRes = - CreateBuiltinUnaryOp(D.AssignmentLoc, UO_PreInc, RefRes.get()); - if (!CounterUpdateRes.isUsable()) { - IsCorrect = false; - continue; - } - CounterUpdateRes = - ActOnFinishFullExpr(CounterUpdateRes.get(), /*DiscardedValue=*/true); - if (!CounterUpdateRes.isUsable()) { - IsCorrect = false; - continue; - } - OMPIteratorHelperData &HD = Helpers.emplace_back(); - HD.CounterVD = CounterVD; - HD.Upper = Res.get(); - HD.Update = UpdateRes.get(); - HD.CounterUpdate = CounterUpdateRes.get(); - } - } else { - Helpers.assign(ID.size(), {}); - } - if (!IsCorrect) { - // Invalidate all created iterator declarations if error is found. - for (const OMPIteratorExpr::IteratorDefinition &D : ID) { - if (Decl *ID = D.IteratorDecl) - ID->setInvalidDecl(); - } - return ExprError(); - } - return OMPIteratorExpr::Create(Context, Context.OMPIteratorTy, IteratorKwLoc, - LLoc, RLoc, ID, Helpers); -} - ExprResult Sema::CreateBuiltinArraySubscriptExpr(Expr *Base, SourceLocation LLoc, Expr *Idx, SourceLocation RLoc) { @@ -7190,8 +6640,8 @@ ExprResult Sema::ActOnCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, } if (LangOpts.OpenMP) - Call = ActOnOpenMPCall(Call, Scope, LParenLoc, ArgExprs, RParenLoc, - ExecConfig); + Call = OpenMP().ActOnOpenMPCall(Call, Scope, LParenLoc, ArgExprs, RParenLoc, + ExecConfig); if (LangOpts.CPlusPlus) { if (const auto *CE = dyn_cast(Call.get())) DiagnosedUnqualifiedCallsToStdFunctions(*this, CE); @@ -19193,7 +18643,7 @@ MarkVarDeclODRUsed(ValueDecl *V, SourceLocation Loc, Sema &SemaRef, } QualType CaptureType, DeclRefType; if (SemaRef.LangOpts.OpenMP) - SemaRef.tryCaptureOpenMPLambdas(V); + SemaRef.OpenMP().tryCaptureOpenMPLambdas(V); SemaRef.tryCaptureVariable(V, Loc, Sema::TryCapture_Implicit, /*EllipsisLoc*/ SourceLocation(), /*BuildAndDiagnose*/ true, CaptureType, @@ -19474,7 +18924,7 @@ static bool captureInBlock(BlockScopeInfo *BSI, ValueDecl *Var, const bool HasBlocksAttr = Var->hasAttr(); if (HasBlocksAttr || CaptureType->isReferenceType() || - (S.getLangOpts().OpenMP && S.isOpenMPCapturedDecl(Var))) { + (S.getLangOpts().OpenMP && S.OpenMP().isOpenMPCapturedDecl(Var))) { // Block capture by reference does not change the capture or // declaration reference types. ByRef = true; @@ -19504,7 +18954,7 @@ static bool captureInCapturedRegion( ByRef = (Kind == Sema::TryCapture_ExplicitByRef); } else if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) { // Using an LValue reference type is consistent with Lambdas (see below). 
- if (S.isOpenMPCapturedDecl(Var)) { + if (S.OpenMP().isOpenMPCapturedDecl(Var)) { bool HasConst = DeclRefType.isConstQualified(); DeclRefType = DeclRefType.getUnqualifiedType(); // Don't lose diagnostics about assignments to const. @@ -19512,11 +18962,11 @@ static bool captureInCapturedRegion( DeclRefType.addConst(); } // Do not capture firstprivates in tasks. - if (S.isOpenMPPrivateDecl(Var, RSI->OpenMPLevel, RSI->OpenMPCaptureLevel) != - OMPC_unknown) + if (S.OpenMP().isOpenMPPrivateDecl(Var, RSI->OpenMPLevel, + RSI->OpenMPCaptureLevel) != OMPC_unknown) return true; - ByRef = S.isOpenMPCapturedByRef(Var, RSI->OpenMPLevel, - RSI->OpenMPCaptureLevel); + ByRef = S.OpenMP().isOpenMPCapturedByRef(Var, RSI->OpenMPLevel, + RSI->OpenMPCaptureLevel); } if (ByRef) @@ -19777,9 +19227,9 @@ bool Sema::tryCaptureVariable( // Capture global variables if it is required to use private copy of this // variable. bool IsGlobal = !VD->hasLocalStorage(); - if (IsGlobal && - !(LangOpts.OpenMP && isOpenMPCapturedDecl(Var, /*CheckScopeInfo=*/true, - MaxFunctionScopesIndex))) + if (IsGlobal && !(LangOpts.OpenMP && + OpenMP().isOpenMPCapturedDecl(Var, /*CheckScopeInfo=*/true, + MaxFunctionScopesIndex))) return true; if (isa(Var)) @@ -19897,7 +19347,7 @@ bool Sema::tryCaptureVariable( } return true; } - OpenMPClauseKind IsOpenMPPrivateDecl = isOpenMPPrivateDecl( + OpenMPClauseKind IsOpenMPPrivateDecl = OpenMP().isOpenMPPrivateDecl( Var, RSI->OpenMPLevel, RSI->OpenMPCaptureLevel); // If the variable is private (i.e. not captured) and has variably // modified type, we still need to capture the type for correct @@ -19908,7 +19358,8 @@ bool Sema::tryCaptureVariable( QualType QTy = Var->getType(); if (ParmVarDecl *PVD = dyn_cast_or_null(Var)) QTy = PVD->getOriginalType(); - for (int I = 1, E = getNumberOfConstructScopes(RSI->OpenMPLevel); + for (int I = 1, + E = OpenMP().getNumberOfConstructScopes(RSI->OpenMPLevel); I < E; ++I) { auto *OuterRSI = cast( FunctionScopes[FunctionScopesIndex - I]); @@ -19920,18 +19371,19 @@ bool Sema::tryCaptureVariable( } bool IsTargetCap = IsOpenMPPrivateDecl != OMPC_private && - isOpenMPTargetCapturedDecl(Var, RSI->OpenMPLevel, - RSI->OpenMPCaptureLevel); + OpenMP().isOpenMPTargetCapturedDecl(Var, RSI->OpenMPLevel, + RSI->OpenMPCaptureLevel); // Do not capture global if it is not privatized in outer regions. bool IsGlobalCap = - IsGlobal && isOpenMPGlobalCapturedDecl(Var, RSI->OpenMPLevel, - RSI->OpenMPCaptureLevel); + IsGlobal && OpenMP().isOpenMPGlobalCapturedDecl( + Var, RSI->OpenMPLevel, RSI->OpenMPCaptureLevel); // When we detect target captures we are looking from inside the // target region, therefore we need to propagate the capture from the // enclosing region. Therefore, the capture is not initially nested. 
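captureInCapturedRegion above now asks the OpenMP component the three questions that decide a capture: is the declaration OpenMP-captured, is it private at this level (firstprivates in tasks are not captured here at all), and if captured, is it by reference. A deliberately simplified sketch of that decision, with an invented enum and names; the real logic also threads const-ness and capture levels:

    enum class DSAKind { Unknown, Private, Firstprivate };

    struct CaptureDecision { bool Capture; bool ByRef; };

    // Distilled shape of the OpenMP branch of captureInCapturedRegion:
    // a data-sharing attribute other than private/unknown (e.g. firstprivate
    // in a task) means "do not capture here"; otherwise the component's
    // by-reference answer decides between reference and copy capture.
    CaptureDecision decideCapture(bool ExplicitByRef, DSAKind DSA,
                                  bool ComponentSaysByRef) {
      if (DSA != DSAKind::Unknown && DSA != DSAKind::Private)
        return {false, false};
      return {true, ExplicitByRef || ComponentSaysByRef};
    }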
if (IsTargetCap) - adjustOpenMPTargetScopeIndex(FunctionScopesIndex, RSI->OpenMPLevel); + OpenMP().adjustOpenMPTargetScopeIndex(FunctionScopesIndex, + RSI->OpenMPLevel); if (IsTargetCap || IsOpenMPPrivateDecl == OMPC_private || (IsGlobal && !IsGlobalCap)) { @@ -20753,8 +20205,8 @@ static void MarkExprReferenced(Sema &SemaRef, SourceLocation Loc, Decl *D, Expr *E, bool MightBeOdrUse, llvm::DenseMap &RefsMinusAssignments) { - if (SemaRef.isInOpenMPDeclareTargetContext()) - SemaRef.checkDeclIsAllowedInOpenMPTarget(E, D); + if (SemaRef.OpenMP().isInOpenMPDeclareTargetContext()) + SemaRef.OpenMP().checkDeclIsAllowedInOpenMPTarget(E, D); if (VarDecl *Var = dyn_cast(D)) { DoMarkVarDeclReferenced(SemaRef, Loc, Var, E, RefsMinusAssignments); diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index 32998ae60eafe2..7ea6d733fe5a2d 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -9,7 +9,6 @@ // This file implements semantic analysis member access expressions. // //===----------------------------------------------------------------------===// -#include "clang/Sema/Overload.h" #include "clang/AST/ASTLambda.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" @@ -18,9 +17,11 @@ #include "clang/AST/ExprObjC.h" #include "clang/Lex/Preprocessor.h" #include "clang/Sema/Lookup.h" +#include "clang/Sema/Overload.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" using namespace clang; using namespace sema; @@ -1900,9 +1901,9 @@ Sema::BuildFieldReferenceExpr(Expr *BaseExpr, bool IsArrow, if (getLangOpts().OpenMP && IsArrow && !CurContext->isDependentContext() && isa(Base.get()->IgnoreParenImpCasts())) { - if (auto *PrivateCopy = isOpenMPCapturedDecl(Field)) { - return getOpenMPCapturedExpr(PrivateCopy, VK, OK, - MemberNameInfo.getLoc()); + if (auto *PrivateCopy = OpenMP().isOpenMPCapturedDecl(Field)) { + return OpenMP().getOpenMPCapturedExpr(PrivateCopy, VK, OK, + MemberNameInfo.getLoc()); } } diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index 35a51c6c2328db..1743afaf15287f 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -21,6 +21,7 @@ #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/Template.h" #include "llvm/ADT/STLExtras.h" #include @@ -1398,7 +1399,7 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, // OpenMP lambdas might get assumumption attributes. 
if (LangOpts.OpenMP) - ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Method); + OpenMP().ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Method); handleLambdaNumbering(Class, Method); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index e9efb4721133fe..d229ef650bccb0 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -11,6 +11,7 @@ /// //===----------------------------------------------------------------------===// +#include "clang/Sema/SemaOpenMP.h" #include "TreeTransform.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTMutationListener.h" @@ -33,6 +34,7 @@ #include "clang/Sema/ParsedAttr.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" +#include "clang/Sema/Sema.h" #include "clang/Sema/SemaInternal.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/PointerEmbeddedInt.h" @@ -1808,9 +1810,9 @@ const DSAStackTy::DSAVarData DSAStackTy::getTopDSA(ValueDecl *D, return DVar; } const_iterator End = end(); - if (!SemaRef.isOpenMPCapturedByRef(D, - std::distance(ParentIterTarget, End), - /*OpenMPCaptureLevel=*/0)) { + if (!SemaRef.OpenMP().isOpenMPCapturedByRef( + D, std::distance(ParentIterTarget, End), + /*OpenMPCaptureLevel=*/0)) { DVar.RefExpr = buildDeclRefExpr(SemaRef, VD, D->getType().getNonReferenceType(), IterTarget->ConstructLoc); @@ -2018,22 +2020,22 @@ bool DSAStackTy::hasDirective( return false; } -void Sema::InitDataSharingAttributesStack() { - VarDataSharingAttributesStack = new DSAStackTy(*this); +void SemaOpenMP::InitDataSharingAttributesStack() { + VarDataSharingAttributesStack = new DSAStackTy(SemaRef); } #define DSAStack static_cast(VarDataSharingAttributesStack) -void Sema::pushOpenMPFunctionRegion() { DSAStack->pushFunction(); } +void SemaOpenMP::pushOpenMPFunctionRegion() { DSAStack->pushFunction(); } -void Sema::popOpenMPFunctionRegion(const FunctionScopeInfo *OldFSI) { +void SemaOpenMP::popOpenMPFunctionRegion(const FunctionScopeInfo *OldFSI) { DSAStack->popFunction(OldFSI); } static bool isOpenMPDeviceDelayedContext(Sema &S) { assert(S.LangOpts.OpenMP && S.LangOpts.OpenMPIsTargetDevice && "Expected OpenMP device compilation."); - return !S.isInOpenMPTargetExecutionDirective(); + return !S.OpenMP().isInOpenMPTargetExecutionDirective(); } namespace { @@ -2045,20 +2047,20 @@ enum class FunctionEmissionStatus { }; } // anonymous namespace -Sema::SemaDiagnosticBuilder -Sema::diagIfOpenMPDeviceCode(SourceLocation Loc, unsigned DiagID, - const FunctionDecl *FD) { - assert(LangOpts.OpenMP && LangOpts.OpenMPIsTargetDevice && +SemaBase::SemaDiagnosticBuilder +SemaOpenMP::diagIfOpenMPDeviceCode(SourceLocation Loc, unsigned DiagID, + const FunctionDecl *FD) { + assert(getLangOpts().OpenMP && getLangOpts().OpenMPIsTargetDevice && "Expected OpenMP device compilation."); SemaDiagnosticBuilder::Kind Kind = SemaDiagnosticBuilder::K_Nop; if (FD) { - FunctionEmissionStatus FES = getEmissionStatus(FD); + Sema::FunctionEmissionStatus FES = SemaRef.getEmissionStatus(FD); switch (FES) { - case FunctionEmissionStatus::Emitted: + case Sema::FunctionEmissionStatus::Emitted: Kind = SemaDiagnosticBuilder::K_Immediate; break; - case FunctionEmissionStatus::Unknown: + case Sema::FunctionEmissionStatus::Unknown: // TODO: We should always delay diagnostics here in case a target // region is in a function we do not emit. 
However, as the // current diagnostics are associated with the function containing @@ -2066,48 +2068,48 @@ Sema::diagIfOpenMPDeviceCode(SourceLocation Loc, unsigned DiagID, // on diagnostics for the target region itself. We need to anchor // the diagnostics with the new generated function *or* ensure we // emit diagnostics associated with the surrounding function. - Kind = isOpenMPDeviceDelayedContext(*this) + Kind = isOpenMPDeviceDelayedContext(SemaRef) ? SemaDiagnosticBuilder::K_Deferred : SemaDiagnosticBuilder::K_Immediate; break; - case FunctionEmissionStatus::TemplateDiscarded: - case FunctionEmissionStatus::OMPDiscarded: + case Sema::FunctionEmissionStatus::TemplateDiscarded: + case Sema::FunctionEmissionStatus::OMPDiscarded: Kind = SemaDiagnosticBuilder::K_Nop; break; - case FunctionEmissionStatus::CUDADiscarded: + case Sema::FunctionEmissionStatus::CUDADiscarded: llvm_unreachable("CUDADiscarded unexpected in OpenMP device compilation"); break; } } - return SemaDiagnosticBuilder(Kind, Loc, DiagID, FD, *this); + return SemaDiagnosticBuilder(Kind, Loc, DiagID, FD, SemaRef); } -Sema::SemaDiagnosticBuilder Sema::diagIfOpenMPHostCode(SourceLocation Loc, - unsigned DiagID, - const FunctionDecl *FD) { - assert(LangOpts.OpenMP && !LangOpts.OpenMPIsTargetDevice && +SemaBase::SemaDiagnosticBuilder +SemaOpenMP::diagIfOpenMPHostCode(SourceLocation Loc, unsigned DiagID, + const FunctionDecl *FD) { + assert(getLangOpts().OpenMP && !getLangOpts().OpenMPIsTargetDevice && "Expected OpenMP host compilation."); SemaDiagnosticBuilder::Kind Kind = SemaDiagnosticBuilder::K_Nop; if (FD) { - FunctionEmissionStatus FES = getEmissionStatus(FD); + Sema::FunctionEmissionStatus FES = SemaRef.getEmissionStatus(FD); switch (FES) { - case FunctionEmissionStatus::Emitted: + case Sema::FunctionEmissionStatus::Emitted: Kind = SemaDiagnosticBuilder::K_Immediate; break; - case FunctionEmissionStatus::Unknown: + case Sema::FunctionEmissionStatus::Unknown: Kind = SemaDiagnosticBuilder::K_Deferred; break; - case FunctionEmissionStatus::TemplateDiscarded: - case FunctionEmissionStatus::OMPDiscarded: - case FunctionEmissionStatus::CUDADiscarded: + case Sema::FunctionEmissionStatus::TemplateDiscarded: + case Sema::FunctionEmissionStatus::OMPDiscarded: + case Sema::FunctionEmissionStatus::CUDADiscarded: Kind = SemaDiagnosticBuilder::K_Nop; break; } } - return SemaDiagnosticBuilder(Kind, Loc, DiagID, FD, *this); + return SemaDiagnosticBuilder(Kind, Loc, DiagID, FD, SemaRef); } static OpenMPDefaultmapClauseKind @@ -2124,9 +2126,9 @@ getVariableCategoryFromDecl(const LangOptions &LO, const ValueDecl *VD) { return OMPC_DEFAULTMAP_aggregate; } -bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, - unsigned OpenMPCaptureLevel) const { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +bool SemaOpenMP::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, + unsigned OpenMPCaptureLevel) const { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); ASTContext &Ctx = getASTContext(); bool IsByRef = true; @@ -2252,7 +2254,7 @@ bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, !Ty->isAnyPointerType()) || !Ty->isScalarType() || DSAStack->isDefaultmapCapturedByRef( - Level, getVariableCategoryFromDecl(LangOpts, D)) || + Level, getVariableCategoryFromDecl(getLangOpts(), D)) || DSAStack->hasExplicitDSA( D, [](OpenMPClauseKind K, bool AppliedToPointee) { @@ -2303,17 +2305,17 @@ bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, return IsByRef; } -unsigned 
Sema::getOpenMPNestingLevel() const { +unsigned SemaOpenMP::getOpenMPNestingLevel() const { assert(getLangOpts().OpenMP); return DSAStack->getNestingLevel(); } -bool Sema::isInOpenMPTaskUntiedContext() const { +bool SemaOpenMP::isInOpenMPTaskUntiedContext() const { return isOpenMPTaskingDirective(DSAStack->getCurrentDirective()) && DSAStack->isUntiedRegion(); } -bool Sema::isInOpenMPTargetExecutionDirective() const { +bool SemaOpenMP::isInOpenMPTargetExecutionDirective() const { return (isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective()) && !DSAStack->isClauseParsingMode()) || DSAStack->hasDirective( @@ -2324,7 +2326,7 @@ bool Sema::isInOpenMPTargetExecutionDirective() const { false); } -bool Sema::isOpenMPRebuildMemberExpr(ValueDecl *D) { +bool SemaOpenMP::isOpenMPRebuildMemberExpr(ValueDecl *D) { // Only rebuild for Field. if (!dyn_cast(D)) return false; @@ -2347,9 +2349,9 @@ static OMPCapturedExprDecl *buildCaptureDecl(Sema &S, IdentifierInfo *Id, DeclContext *CurContext, bool AsExpression); -VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, - unsigned StopAt) { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +VarDecl *SemaOpenMP::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, + unsigned StopAt) { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); D = getCanonicalDecl(D); auto *VD = dyn_cast(D); @@ -2368,7 +2370,8 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, // 'target' we return true so that this global is also mapped to the device. // if (VD && !VD->hasLocalStorage() && - (getCurCapturedRegion() || getCurBlock() || getCurLambda())) { + (SemaRef.getCurCapturedRegion() || SemaRef.getCurBlock() || + SemaRef.getCurLambda())) { if (isInOpenMPTargetExecutionDirective()) { DSAStackTy::DSAVarData DVarTop = DSAStack->getTopDSA(D, DSAStack->isClauseParsingMode()); @@ -2381,8 +2384,9 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, return nullptr; CapturedRegionScopeInfo *CSI = nullptr; for (FunctionScopeInfo *FSI : llvm::drop_begin( - llvm::reverse(FunctionScopes), - CheckScopeInfo ? (FunctionScopes.size() - (StopAt + 1)) : 0)) { + llvm::reverse(SemaRef.FunctionScopes), + CheckScopeInfo ? (SemaRef.FunctionScopes.size() - (StopAt + 1)) + : 0)) { if (!isa(FSI)) return nullptr; if (auto *RSI = dyn_cast(FSI)) @@ -2401,7 +2405,7 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, if (isInOpenMPDeclareTargetContext()) { // Try to mark variable as declare target if it is used in capturing // regions. 
- if (LangOpts.OpenMP <= 45 && + if (getLangOpts().OpenMP <= 45 && !OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) checkDeclIsAllowedInOpenMPTarget(nullptr, VD); return nullptr; @@ -2411,7 +2415,7 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, if (CheckScopeInfo) { bool OpenMPFound = false; for (unsigned I = StopAt + 1; I > 0; --I) { - FunctionScopeInfo *FSI = FunctionScopes[I - 1]; + FunctionScopeInfo *FSI = SemaRef.FunctionScopes[I - 1]; if (!isa(FSI)) return nullptr; if (auto *RSI = dyn_cast(FSI)) @@ -2476,22 +2480,23 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, VarDecl *VD = DSAStack->getImplicitFDCapExprDecl(FD); if (VD) return VD; - if (getCurrentThisType().isNull()) + if (SemaRef.getCurrentThisType().isNull()) return nullptr; - Expr *ThisExpr = BuildCXXThisExpr(SourceLocation(), getCurrentThisType(), - /*IsImplicit=*/true); + Expr *ThisExpr = SemaRef.BuildCXXThisExpr(SourceLocation(), + SemaRef.getCurrentThisType(), + /*IsImplicit=*/true); const CXXScopeSpec CS = CXXScopeSpec(); - Expr *ME = BuildMemberExpr(ThisExpr, /*IsArrow=*/true, SourceLocation(), - NestedNameSpecifierLoc(), SourceLocation(), FD, - DeclAccessPair::make(FD, FD->getAccess()), - /*HadMultipleCandidates=*/false, - DeclarationNameInfo(), FD->getType(), - VK_LValue, OK_Ordinary); + Expr *ME = SemaRef.BuildMemberExpr( + ThisExpr, /*IsArrow=*/true, SourceLocation(), + NestedNameSpecifierLoc(), SourceLocation(), FD, + DeclAccessPair::make(FD, FD->getAccess()), + /*HadMultipleCandidates=*/false, DeclarationNameInfo(), FD->getType(), + VK_LValue, OK_Ordinary); OMPCapturedExprDecl *CD = buildCaptureDecl( - *this, FD->getIdentifier(), ME, DVarPrivate.CKind != OMPC_private, - CurContext->getParent(), /*AsExpression=*/false); + SemaRef, FD->getIdentifier(), ME, DVarPrivate.CKind != OMPC_private, + SemaRef.CurContext->getParent(), /*AsExpression=*/false); DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr( - *this, CD, CD->getType().getNonReferenceType(), SourceLocation()); + SemaRef, CD, CD->getType().getNonReferenceType(), SourceLocation()); VD = cast(VDPrivateRefExpr->getDecl()); DSAStack->addImplicitDefaultFirstprivateFD(FD, VD); return VD; @@ -2505,28 +2510,28 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, return nullptr; } -void Sema::adjustOpenMPTargetScopeIndex(unsigned &FunctionScopesIndex, - unsigned Level) const { +void SemaOpenMP::adjustOpenMPTargetScopeIndex(unsigned &FunctionScopesIndex, + unsigned Level) const { FunctionScopesIndex -= getOpenMPCaptureLevels(DSAStack->getDirective(Level)); } -void Sema::startOpenMPLoop() { - assert(LangOpts.OpenMP && "OpenMP must be enabled."); +void SemaOpenMP::startOpenMPLoop() { + assert(getLangOpts().OpenMP && "OpenMP must be enabled."); if (isOpenMPLoopDirective(DSAStack->getCurrentDirective())) DSAStack->loopInit(); } -void Sema::startOpenMPCXXRangeFor() { - assert(LangOpts.OpenMP && "OpenMP must be enabled."); +void SemaOpenMP::startOpenMPCXXRangeFor() { + assert(getLangOpts().OpenMP && "OpenMP must be enabled."); if (isOpenMPLoopDirective(DSAStack->getCurrentDirective())) { DSAStack->resetPossibleLoopCounter(); DSAStack->loopStart(); } } -OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, - unsigned CapLevel) const { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +OpenMPClauseKind SemaOpenMP::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, + unsigned CapLevel) const { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); if 
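InitDataSharingAttributesStack, DestroyDataSharingAttributesStack, and the DSAStack macro above keep DSAStackTy private to SemaOpenMP.cpp behind an untyped pointer; the macro's cast was garbled in extraction and in the patch reads #define DSAStack static_cast<DSAStackTy *>(VarDataSharingAttributesStack). The same pimpl-by-void* idiom in miniature, with hypothetical names:

    // Header side of the sketch: the stack type stays an incomplete secret,
    // so clients of Analyzer never see StackImpl.
    class Analyzer {
    public:
      void init();
      void destroy();
    private:
      void *StackPtr = nullptr; // really a StackImpl *, see below
    };

    // Implementation side of the sketch.
    #include <vector>
    struct StackImpl { std::vector<int> Frames; };

    #define STACK static_cast<StackImpl *>(StackPtr)

    void Analyzer::init() { StackPtr = new StackImpl(); }
    void Analyzer::destroy() { delete STACK; StackPtr = nullptr; }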
(DSAStack->getCurrentDirective() != OMPD_unknown && (!DSAStack->isClauseParsingMode() || DSAStack->getParentDirective() != OMPD_unknown)) { @@ -2546,7 +2551,8 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, } if (DSAStack->hasExplicitDirective(isOpenMPTaskingDirective, Level)) { bool IsTriviallyCopyable = - D->getType().getNonReferenceType().isTriviallyCopyableType(Context) && + D->getType().getNonReferenceType().isTriviallyCopyableType( + getASTContext()) && !D->getType() .getNonReferenceType() .getCanonicalType() @@ -2620,9 +2626,9 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, : OMPC_unknown; } -void Sema::setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D, - unsigned Level) { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +void SemaOpenMP::setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D, + unsigned Level) { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); D = getCanonicalDecl(D); OpenMPClauseKind OMPC = OMPC_unknown; for (unsigned I = DSAStack->getNestingLevel() + 1; I > Level; --I) { @@ -2649,18 +2655,19 @@ void Sema::setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D, NewLevel)) { OMPC = OMPC_map; if (DSAStack->mustBeFirstprivateAtLevel( - NewLevel, getVariableCategoryFromDecl(LangOpts, D))) + NewLevel, getVariableCategoryFromDecl(getLangOpts(), D))) OMPC = OMPC_firstprivate; break; } } if (OMPC != OMPC_unknown) - FD->addAttr(OMPCaptureKindAttr::CreateImplicit(Context, unsigned(OMPC))); + FD->addAttr( + OMPCaptureKindAttr::CreateImplicit(getASTContext(), unsigned(OMPC))); } -bool Sema::isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level, - unsigned CaptureLevel) const { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +bool SemaOpenMP::isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level, + unsigned CaptureLevel) const { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); // Return true if the current level is no longer enclosed in a target region. SmallVector Regions; @@ -2672,9 +2679,9 @@ bool Sema::isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level, Regions[CaptureLevel] != OMPD_task; } -bool Sema::isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level, - unsigned CaptureLevel) const { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +bool SemaOpenMP::isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level, + unsigned CaptureLevel) const { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); // Return true if the current level is no longer enclosed in a target region. 
if (const auto *VD = dyn_cast<VarDecl>(D)) { @@ -2702,37 +2709,37 @@ bool Sema::isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level,
 return true;
 }
 
-void Sema::DestroyDataSharingAttributesStack() { delete DSAStack; }
+void SemaOpenMP::DestroyDataSharingAttributesStack() { delete DSAStack; }
 
-void Sema::ActOnOpenMPBeginDeclareVariant(SourceLocation Loc,
-                                          OMPTraitInfo &TI) {
+void SemaOpenMP::ActOnOpenMPBeginDeclareVariant(SourceLocation Loc,
+                                                OMPTraitInfo &TI) {
 OMPDeclareVariantScopes.push_back(OMPDeclareVariantScope(TI));
 }
 
-void Sema::ActOnOpenMPEndDeclareVariant() {
+void SemaOpenMP::ActOnOpenMPEndDeclareVariant() {
 assert(isInOpenMPDeclareVariantScope() &&
 "Not in OpenMP declare variant scope!");
 OMPDeclareVariantScopes.pop_back();
 }
 
-void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller,
-                                         const FunctionDecl *Callee,
-                                         SourceLocation Loc) {
-  assert(LangOpts.OpenMP && "Expected OpenMP compilation mode.");
+void SemaOpenMP::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller,
+                                               const FunctionDecl *Callee,
+                                               SourceLocation Loc) {
+  assert(getLangOpts().OpenMP && "Expected OpenMP compilation mode.");
 std::optional<OMPDeclareTargetDeclAttr::DevTypeTy> DevTy =
 OMPDeclareTargetDeclAttr::getDeviceType(Caller->getMostRecentDecl());
 // Ignore host functions during device analysis.
-  if (LangOpts.OpenMPIsTargetDevice &&
+  if (getLangOpts().OpenMPIsTargetDevice &&
 (!DevTy || *DevTy == OMPDeclareTargetDeclAttr::DT_Host))
 return;
 // Ignore nohost functions during host analysis.
-  if (!LangOpts.OpenMPIsTargetDevice && DevTy &&
+  if (!getLangOpts().OpenMPIsTargetDevice && DevTy &&
 *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost)
 return;
 const FunctionDecl *FD = Callee->getMostRecentDecl();
 DevTy = OMPDeclareTargetDeclAttr::getDeviceType(FD);
-  if (LangOpts.OpenMPIsTargetDevice && DevTy &&
+  if (getLangOpts().OpenMPIsTargetDevice && DevTy &&
 *DevTy == OMPDeclareTargetDeclAttr::DT_Host) {
 // Diagnose host function called during device codegen.
StringRef HostDevTy = @@ -2743,8 +2750,9 @@ void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, << HostDevTy; return; } - if (!LangOpts.OpenMPIsTargetDevice && !LangOpts.OpenMPOffloadMandatory && - DevTy && *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) { + if (!getLangOpts().OpenMPIsTargetDevice && + !getLangOpts().OpenMPOffloadMandatory && DevTy && + *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) { // In OpenMP 5.2 or later, if the function has a host variant then allow // that to be called instead auto &&HasHostAttr = [](const FunctionDecl *Callee) { @@ -2773,21 +2781,21 @@ void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, } } -void Sema::StartOpenMPDSABlock(OpenMPDirectiveKind DKind, - const DeclarationNameInfo &DirName, - Scope *CurScope, SourceLocation Loc) { +void SemaOpenMP::StartOpenMPDSABlock(OpenMPDirectiveKind DKind, + const DeclarationNameInfo &DirName, + Scope *CurScope, SourceLocation Loc) { DSAStack->push(DKind, DirName, CurScope, Loc); - PushExpressionEvaluationContext( - ExpressionEvaluationContext::PotentiallyEvaluated); + SemaRef.PushExpressionEvaluationContext( + Sema::ExpressionEvaluationContext::PotentiallyEvaluated); } -void Sema::StartOpenMPClause(OpenMPClauseKind K) { +void SemaOpenMP::StartOpenMPClause(OpenMPClauseKind K) { DSAStack->setClauseParsingMode(K); } -void Sema::EndOpenMPClause() { +void SemaOpenMP::EndOpenMPClause() { DSAStack->setClauseParsingMode(/*K=*/OMPC_unknown); - CleanupVarDeclMarking(); + SemaRef.CleanupVarDeclMarking(); } static std::pair @@ -2871,7 +2879,7 @@ static void reportOriginalDsa(Sema &SemaRef, const DSAStackTy *Stack, const DSAStackTy::DSAVarData &DVar, bool IsLoopIterVar = false); -void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { +void SemaOpenMP::EndOpenMPDSABlock(Stmt *CurDirective) { // OpenMP [2.14.3.5, Restrictions, C/C++, p.1] // A variable of class type (or array thereof) that appears in a lastprivate // clause requires an accessible, unambiguous default constructor for the @@ -2898,15 +2906,15 @@ void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { // variable is not added to IdResolver, so the code in the OpenMP // region uses original variable for proper diagnostics. VarDecl *VDPrivate = buildVarDecl( - *this, DE->getExprLoc(), Type.getUnqualifiedType(), + SemaRef, DE->getExprLoc(), Type.getUnqualifiedType(), VD->getName(), VD->hasAttrs() ? &VD->getAttrs() : nullptr, DRE); - ActOnUninitializedDecl(VDPrivate); + SemaRef.ActOnUninitializedDecl(VDPrivate); if (VDPrivate->isInvalidDecl()) { PrivateCopies.push_back(nullptr); continue; } PrivateCopies.push_back(buildDeclRefExpr( - *this, VDPrivate, DE->getType(), DE->getExprLoc())); + SemaRef, VDPrivate, DE->getType(), DE->getExprLoc())); } else { // The variable is also a firstprivate, so initialization sequence // for private copy is generated already. @@ -2924,7 +2932,7 @@ void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) // It will be analyzed later. 
PrivateRefs.push_back(RefExpr); @@ -2977,7 +2985,7 @@ void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { diag::err_omp_allocator_used_in_clauses) << D.Allocator->getSourceRange(); if (DVar.RefExpr) - reportOriginalDsa(*this, DSAStack, VD, DVar); + reportOriginalDsa(SemaRef, DSAStack, VD, DVar); else Diag(MapExpr->getExprLoc(), diag::note_used_here) << MapExpr->getSourceRange(); @@ -2987,14 +2995,14 @@ void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { } } // Check allocate clauses. - if (!CurContext->isDependentContext()) - checkAllocateClauses(*this, DSAStack, D->clauses()); - checkReductionClauses(*this, DSAStack, D->clauses()); + if (!SemaRef.CurContext->isDependentContext()) + checkAllocateClauses(SemaRef, DSAStack, D->clauses()); + checkReductionClauses(SemaRef, DSAStack, D->clauses()); } DSAStack->pop(); - DiscardCleanupsInEvaluationContext(); - PopExpressionEvaluationContext(); + SemaRef.DiscardCleanupsInEvaluationContext(); + SemaRef.PopExpressionEvaluationContext(); } static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV, @@ -3047,27 +3055,28 @@ class VarOrFuncDeclFilterCCC final : public CorrectionCandidateCallback { } // namespace -ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, - CXXScopeSpec &ScopeSpec, - const DeclarationNameInfo &Id, - OpenMPDirectiveKind Kind) { - LookupResult Lookup(*this, Id, LookupOrdinaryName); - LookupParsedName(Lookup, CurScope, &ScopeSpec, true); +ExprResult SemaOpenMP::ActOnOpenMPIdExpression(Scope *CurScope, + CXXScopeSpec &ScopeSpec, + const DeclarationNameInfo &Id, + OpenMPDirectiveKind Kind) { + ASTContext &Context = getASTContext(); + LookupResult Lookup(SemaRef, Id, Sema::LookupOrdinaryName); + SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec, true); if (Lookup.isAmbiguous()) return ExprError(); VarDecl *VD; if (!Lookup.isSingleResult()) { - VarDeclFilterCCC CCC(*this); + VarDeclFilterCCC CCC(SemaRef); if (TypoCorrection Corrected = - CorrectTypo(Id, LookupOrdinaryName, CurScope, nullptr, CCC, - CTK_ErrorRecovery)) { - diagnoseTypo(Corrected, - PDiag(Lookup.empty() - ? diag::err_undeclared_var_use_suggest - : diag::err_omp_expected_var_arg_suggest) - << Id.getName()); + SemaRef.CorrectTypo(Id, Sema::LookupOrdinaryName, CurScope, nullptr, + CCC, Sema::CTK_ErrorRecovery)) { + SemaRef.diagnoseTypo( + Corrected, + SemaRef.PDiag(Lookup.empty() ? diag::err_undeclared_var_use_suggest + : diag::err_omp_expected_var_arg_suggest) + << Id.getName()); VD = Corrected.getCorrectionDeclAs(); } else { Diag(Id.getLoc(), Lookup.empty() ? diag::err_undeclared_var_use @@ -3101,7 +3110,7 @@ ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, // A threadprivate directive for file-scope variables must appear outside // any definition or declaration. if (CanonicalVD->getDeclContext()->isTranslationUnit() && - !getCurLexicalContext()->isTranslationUnit()) { + !SemaRef.getCurLexicalContext()->isTranslationUnit()) { Diag(Id.getLoc(), diag::err_omp_var_scope) << getOpenMPDirectiveName(Kind) << VD; bool IsDecl = @@ -3116,7 +3125,7 @@ ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, // in the class definition, in the same scope in which the member // variables are declared. 
if (CanonicalVD->isStaticDataMember() && - !CanonicalVD->getDeclContext()->Equals(getCurLexicalContext())) { + !CanonicalVD->getDeclContext()->Equals(SemaRef.getCurLexicalContext())) { Diag(Id.getLoc(), diag::err_omp_var_scope) << getOpenMPDirectiveName(Kind) << VD; bool IsDecl = @@ -3131,8 +3140,9 @@ ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, // outside any definition or declaration other than the namespace // definition itself. if (CanonicalVD->getDeclContext()->isNamespace() && - (!getCurLexicalContext()->isFileContext() || - !getCurLexicalContext()->Encloses(CanonicalVD->getDeclContext()))) { + (!SemaRef.getCurLexicalContext()->isFileContext() || + !SemaRef.getCurLexicalContext()->Encloses( + CanonicalVD->getDeclContext()))) { Diag(Id.getLoc(), diag::err_omp_var_scope) << getOpenMPDirectiveName(Kind) << VD; bool IsDecl = @@ -3146,7 +3156,7 @@ ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, // A threadprivate directive for static block-scope variables must appear // in the scope of the variable and not in a nested scope. if (CanonicalVD->isLocalVarDecl() && CurScope && - !isDeclInScope(ND, getCurLexicalContext(), CurScope)) { + !SemaRef.isDeclInScope(ND, SemaRef.getCurLexicalContext(), CurScope)) { Diag(Id.getLoc(), diag::err_omp_var_scope) << getOpenMPDirectiveName(Kind) << VD; bool IsDecl = @@ -3174,11 +3184,11 @@ ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, Id.getLoc(), ExprType, VK_LValue); } -Sema::DeclGroupPtrTy -Sema::ActOnOpenMPThreadprivateDirective(SourceLocation Loc, - ArrayRef VarList) { +SemaOpenMP::DeclGroupPtrTy +SemaOpenMP::ActOnOpenMPThreadprivateDirective(SourceLocation Loc, + ArrayRef VarList) { if (OMPThreadPrivateDecl *D = CheckOMPThreadPrivateDecl(Loc, VarList)) { - CurContext->addDecl(D); + SemaRef.CurContext->addDecl(D); return DeclGroupPtrTy::make(DeclGroupRef(D)); } return nullptr; @@ -3215,7 +3225,9 @@ class LocalVarRefChecker final } // namespace OMPThreadPrivateDecl * -Sema::CheckOMPThreadPrivateDecl(SourceLocation Loc, ArrayRef VarList) { +SemaOpenMP::CheckOMPThreadPrivateDecl(SourceLocation Loc, + ArrayRef VarList) { + ASTContext &Context = getASTContext(); SmallVector Vars; for (Expr *RefExpr : VarList) { auto *DE = cast(RefExpr); @@ -3235,8 +3247,8 @@ Sema::CheckOMPThreadPrivateDecl(SourceLocation Loc, ArrayRef VarList) { // OpenMP [2.9.2, Restrictions, C/C++, p.10] // A threadprivate variable must not have an incomplete type. - if (RequireCompleteType(ILoc, VD->getType(), - diag::err_omp_threadprivate_incomplete_type)) { + if (SemaRef.RequireCompleteType( + ILoc, VD->getType(), diag::err_omp_threadprivate_incomplete_type)) { continue; } @@ -3274,7 +3286,7 @@ Sema::CheckOMPThreadPrivateDecl(SourceLocation Loc, ArrayRef VarList) { // Check if initial value of threadprivate variable reference variable with // local storage (it is not supported by runtime). 
if (const Expr *Init = VD->getAnyInitializer()) {
-      LocalVarRefChecker Checker(*this);
+      LocalVarRefChecker Checker(SemaRef);
 if (Checker.Visit(Init))
 continue;
 }
@@ -3288,8 +3300,8 @@
 }
 OMPThreadPrivateDecl *D = nullptr;
 if (!Vars.empty()) {
-    D = OMPThreadPrivateDecl::Create(Context, getCurLexicalContext(), Loc,
-                                     Vars);
+    D = OMPThreadPrivateDecl::Create(Context, SemaRef.getCurLexicalContext(),
+                                     Loc, Vars);
 D->setAccess(AS_public);
 }
 return D;
 }
@@ -3395,10 +3407,9 @@ applyOMPAllocateAttribute(Sema &S, VarDecl *VD,
 ML->DeclarationMarkedOpenMPAllocate(VD, A);
 }
 
-Sema::DeclGroupPtrTy
-Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef<Expr *> VarList,
-                                   ArrayRef<OMPClause *> Clauses,
-                                   DeclContext *Owner) {
+SemaOpenMP::DeclGroupPtrTy SemaOpenMP::ActOnOpenMPAllocateDirective(
+    SourceLocation Loc, ArrayRef<Expr *> VarList, ArrayRef<OMPClause *> Clauses,
+    DeclContext *Owner) {
 assert(Clauses.size() <= 2 && "Expected at most two clauses.");
 Expr *Alignment = nullptr;
 Expr *Allocator = nullptr;
@@ -3407,9 +3418,9 @@ Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef<Expr *> VarList,
 // allocate directives that appear in a target region must specify an
 // allocator clause unless a requires directive with the dynamic_allocators
 // clause is present in the same compilation unit.
-    if (LangOpts.OpenMPIsTargetDevice &&
+    if (getLangOpts().OpenMPIsTargetDevice &&
 !DSAStack->hasRequiresDeclWithClause<OMPDynamicAllocatorsClause>())
-      targetDiag(Loc, diag::err_expected_allocator_clause);
+      SemaRef.targetDiag(Loc, diag::err_expected_allocator_clause);
 } else {
 for (const OMPClause *C : Clauses)
 if (const auto *AC = dyn_cast<OMPAllocatorClause>(C))
@@ -3420,7 +3431,7 @@ Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef<Expr *> VarList,
 llvm_unreachable("Unexpected clause on allocate directive");
 }
 OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind =
-      getAllocatorKind(*this, DSAStack, Allocator);
+      getAllocatorKind(SemaRef, DSAStack, Allocator);
 SmallVector<Expr *> Vars;
 for (Expr *RefExpr : VarList) {
 auto *DE = cast<DeclRefExpr>(RefExpr);
@@ -3435,7 +3446,7 @@ Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef<Expr *> VarList,
 // If used several times in the allocate directive, the same allocator
 // must be used.
-    if (checkPreviousOMPAllocateAttribute(*this, DSAStack, RefExpr, VD,
+    if (checkPreviousOMPAllocateAttribute(SemaRef, DSAStack, RefExpr, VD,
 AllocatorKind, Allocator))
 continue;
@@ -3448,7 +3459,7 @@ Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef<Expr *> VarList,
 Diag(Allocator->getExprLoc(),
 diag::err_omp_expected_predefined_allocator)
 << Allocator->getSourceRange();
-      bool IsDecl = VD->isThisDeclarationADefinition(Context) ==
+      bool IsDecl = VD->isThisDeclarationADefinition(getASTContext()) ==
 VarDecl::DeclarationOnly;
 Diag(VD->getLocation(), IsDecl ?
diag::note_previous_decl : diag::note_defined_here) @@ -3458,45 +3469,46 @@ Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef VarList, } Vars.push_back(RefExpr); - applyOMPAllocateAttribute(*this, VD, AllocatorKind, Allocator, Alignment, + applyOMPAllocateAttribute(SemaRef, VD, AllocatorKind, Allocator, Alignment, DE->getSourceRange()); } if (Vars.empty()) return nullptr; if (!Owner) - Owner = getCurLexicalContext(); - auto *D = OMPAllocateDecl::Create(Context, Owner, Loc, Vars, Clauses); + Owner = SemaRef.getCurLexicalContext(); + auto *D = OMPAllocateDecl::Create(getASTContext(), Owner, Loc, Vars, Clauses); D->setAccess(AS_public); Owner->addDecl(D); return DeclGroupPtrTy::make(DeclGroupRef(D)); } -Sema::DeclGroupPtrTy -Sema::ActOnOpenMPRequiresDirective(SourceLocation Loc, - ArrayRef ClauseList) { +SemaOpenMP::DeclGroupPtrTy +SemaOpenMP::ActOnOpenMPRequiresDirective(SourceLocation Loc, + ArrayRef ClauseList) { OMPRequiresDecl *D = nullptr; - if (!CurContext->isFileContext()) { + if (!SemaRef.CurContext->isFileContext()) { Diag(Loc, diag::err_omp_invalid_scope) << "requires"; } else { D = CheckOMPRequiresDecl(Loc, ClauseList); if (D) { - CurContext->addDecl(D); + SemaRef.CurContext->addDecl(D); DSAStack->addRequiresDecl(D); } } return DeclGroupPtrTy::make(DeclGroupRef(D)); } -void Sema::ActOnOpenMPAssumesDirective(SourceLocation Loc, - OpenMPDirectiveKind DKind, - ArrayRef Assumptions, - bool SkippedClauses) { +void SemaOpenMP::ActOnOpenMPAssumesDirective(SourceLocation Loc, + OpenMPDirectiveKind DKind, + ArrayRef Assumptions, + bool SkippedClauses) { if (!SkippedClauses && Assumptions.empty()) Diag(Loc, diag::err_omp_no_clause_for_directive) << llvm::omp::getAllAssumeClauseOptions() << llvm::omp::getOpenMPDirectiveName(DKind); - auto *AA = OMPAssumeAttr::Create(Context, llvm::join(Assumptions, ","), Loc); + auto *AA = + OMPAssumeAttr::Create(getASTContext(), llvm::join(Assumptions, ","), Loc); if (DKind == llvm::omp::Directive::OMPD_begin_assumes) { OMPAssumeScoped.push_back(AA); return; @@ -3515,7 +3527,7 @@ void Sema::ActOnOpenMPAssumesDirective(SourceLocation Loc, // declarations in included headers. To this end, we traverse all existing // declaration contexts and annotate function declarations here. SmallVector DeclContexts; - auto *Ctx = CurContext; + auto *Ctx = SemaRef.CurContext; while (Ctx->getLexicalParent()) Ctx = Ctx->getLexicalParent(); DeclContexts.push_back(Ctx); @@ -3539,13 +3551,14 @@ void Sema::ActOnOpenMPAssumesDirective(SourceLocation Loc, } } -void Sema::ActOnOpenMPEndAssumesDirective() { +void SemaOpenMP::ActOnOpenMPEndAssumesDirective() { assert(isInOpenMPAssumeScope() && "Not in OpenMP assumes scope!"); OMPAssumeScoped.pop_back(); } -OMPRequiresDecl *Sema::CheckOMPRequiresDecl(SourceLocation Loc, - ArrayRef ClauseList) { +OMPRequiresDecl * +SemaOpenMP::CheckOMPRequiresDecl(SourceLocation Loc, + ArrayRef ClauseList) { /// For target specific clauses, the requires directive cannot be /// specified after the handling of any of the target regions in the /// current compilation unit. 
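
The hunks in this patch are mechanical and all follow one recipe: an OpenMP entry point moves from `Sema` to `SemaOpenMP`, bare `LangOpts`/`Context` uses become the `getLangOpts()`/`getASTContext()` helpers, and anything still owned by the main `Sema` object (lookup, capture regions, expression building, `CurContext`) is reached through the `SemaRef` back-reference, while OpenMP-owned state such as `DSAStack` and `OMPDeclareVariantScopes` is touched directly. A minimal sketch of that shape follows; the `SemaBase` name and the trimmed-down member lists are illustrative assumptions inferred from the diff, not the actual declarations:

    // Sketch only -- a simplified view of the Sema -> SemaOpenMP split,
    // assuming a small shared base class that supplies the common helpers.
    class Sema; // the monolithic semantic-analysis class being carved up

    class SemaBase { // hypothetical name for the shared-helper base
    public:
      explicit SemaBase(Sema &S) : SemaRef(S) {}
      ASTContext &getASTContext() const;      // forwards to the main Sema
      const LangOptions &getLangOpts() const; // forwards to the main Sema
      Sema &SemaRef; // back-reference for everything not yet split out
    };

    class SemaOpenMP : public SemaBase {
    public:
      explicit SemaOpenMP(Sema &S) : SemaBase(S) {}
      // Moved members touch OpenMP-owned state (DSAStack, declare-variant
      // scopes) directly and reach the rest as SemaRef.LookupParsedName(...),
      // SemaRef.BuildMemberExpr(...), and so on, exactly as in the hunks.
      bool isInOpenMPTargetExecutionDirective() const;
    };

The bodies in the hunks shown here are byte-for-byte moves plus these qualifier rewrites; no behavior change is visible in them.
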
@@ -3576,8 +3589,8 @@ OMPRequiresDecl *Sema::CheckOMPRequiresDecl(SourceLocation Loc, } if (!DSAStack->hasDuplicateRequiresClause(ClauseList)) - return OMPRequiresDecl::Create(Context, getCurLexicalContext(), Loc, - ClauseList); + return OMPRequiresDecl::Create( + getASTContext(), SemaRef.getCurLexicalContext(), Loc, ClauseList); return nullptr; } @@ -3695,7 +3708,7 @@ class DSAAttrChecker final : public StmtVisitor { llvm::SmallVector ImplicitMap[DefaultmapKindNum][OMPC_MAP_delete]; llvm::SmallVector ImplicitMapModifier[DefaultmapKindNum]; - Sema::VarsWithInheritedDSAType VarsWithInheritedDSA; + SemaOpenMP::VarsWithInheritedDSAType VarsWithInheritedDSA; llvm::SmallDenseSet ImplicitDeclarations; void VisitSubCaptures(OMPExecutableDirective *S) { @@ -4161,7 +4174,7 @@ class DSAAttrChecker final : public StmtVisitor { getImplicitMapModifier(OpenMPDefaultmapClauseKind Kind) const { return ImplicitMapModifier[Kind]; } - const Sema::VarsWithInheritedDSAType &getVarsWithInheritedDSA() const { + const SemaOpenMP::VarsWithInheritedDSAType &getVarsWithInheritedDSA() const { return VarsWithInheritedDSA; } @@ -4193,7 +4206,9 @@ static void handleDeclareVariantConstructTrait(DSAStackTy *Stack, Stack->handleConstructTrait(Traits, ScopeEntry); } -void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { +void SemaOpenMP::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, + Scope *CurScope) { + ASTContext &Context = getASTContext(); switch (DKind) { case OMPD_parallel: case OMPD_parallel_for: @@ -4208,13 +4223,13 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst(); QualType KmpInt32PtrTy = Context.getPointerType(KmpInt32Ty).withConst().withRestrict(); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); break; } case OMPD_target_teams: @@ -4232,7 +4247,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4242,31 +4257,33 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".task_t.", Context.VoidPtrTy.withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params, /*OpenMPCaptureLevel=*/0); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params, + /*OpenMPCaptureLevel=*/0); // Mark this captured region as inlined, because we don't use outlined // function directly. 
- getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); - SmallVector ParamsTarget; + SmallVector ParamsTarget; if (getLangOpts().OpenMPIsTargetDevice) ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy)); ParamsTarget.push_back( std::make_pair(StringRef(), QualType())); // __context with shared vars; // Start a captured region for 'target' with no implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTarget, - /*OpenMPCaptureLevel=*/1); - Sema::CapturedParamNameType ParamsTeamsOrParallel[] = { + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTarget, + /*OpenMPCaptureLevel=*/1); + SemaOpenMP::CapturedParamNameType ParamsTeamsOrParallel[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; // Start a captured region for 'teams' or 'parallel'. Both regions have // the same implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTeamsOrParallel, /*OpenMPCaptureLevel=*/2); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTeamsOrParallel, + /*OpenMPCaptureLevel=*/2); break; } case OMPD_target: @@ -4279,7 +4296,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4289,21 +4306,22 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".task_t.", Context.VoidPtrTy.withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params, /*OpenMPCaptureLevel=*/0); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params, + /*OpenMPCaptureLevel=*/0); // Mark this captured region as inlined, because we don't use outlined // function directly. 
- getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); - SmallVector ParamsTarget; + SmallVector ParamsTarget; if (getLangOpts().OpenMPIsTargetDevice) ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy)); ParamsTarget.push_back( std::make_pair(StringRef(), QualType())); // __context with shared vars; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTarget, - /*OpenMPCaptureLevel=*/1); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTarget, + /*OpenMPCaptureLevel=*/1); break; } case OMPD_atomic: @@ -4329,11 +4347,11 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { case OMPD_scope: case OMPD_target_data: case OMPD_dispatch: { - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); break; } case OMPD_task: { @@ -4345,7 +4363,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4355,11 +4373,11 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".task_t.", Context.VoidPtrTy.withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); // Mark this captured region as inlined, because we don't use outlined // function directly. - getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); break; @@ -4386,7 +4404,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4401,11 +4419,11 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".reductions.", VoidPtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); // Mark this captured region as inlined, because we don't use outlined // function directly. 
- getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); break; @@ -4426,19 +4444,20 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict(); QualType KmpInt32PtrTy = Context.getPointerType(KmpInt32Ty).withConst().withRestrict(); - Sema::CapturedParamNameType ParamsParallel[] = { + SemaOpenMP::CapturedParamNameType ParamsParallel[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; // Start a captured region for 'parallel'. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsParallel, /*OpenMPCaptureLevel=*/0); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsParallel, + /*OpenMPCaptureLevel=*/0); QualType Args[] = {VoidPtrTy}; FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4453,11 +4472,12 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".reductions.", VoidPtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params, /*OpenMPCaptureLevel=*/1); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params, + /*OpenMPCaptureLevel=*/1); // Mark this captured region as inlined, because we don't use outlined // function directly. 
- getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); break; @@ -4467,15 +4487,15 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst(); QualType KmpInt32PtrTy = Context.getPointerType(KmpInt32Ty).withConst().withRestrict(); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(".previous.lb.", Context.getSizeType().withConst()), std::make_pair(".previous.ub.", Context.getSizeType().withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); break; } // For 'target teams loop', collect all captured regions so codegen can @@ -4492,7 +4512,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4502,32 +4522,35 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".task_t.", Context.VoidPtrTy.withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params, /*OpenMPCaptureLevel=*/0); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params, + /*OpenMPCaptureLevel=*/0); // Mark this captured region as inlined, because we don't use outlined // function directly. - getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); - SmallVector ParamsTarget; + SmallVector ParamsTarget; if (getLangOpts().OpenMPIsTargetDevice) ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy)); ParamsTarget.push_back( std::make_pair(StringRef(), QualType())); // __context with shared vars; // Start a captured region for 'target' with no implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTarget, /*OpenMPCaptureLevel=*/1); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTarget, + /*OpenMPCaptureLevel=*/1); - Sema::CapturedParamNameType ParamsTeams[] = { + SemaOpenMP::CapturedParamNameType ParamsTeams[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; // Start a captured region for 'target' with no implicit parameters. 
- ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTeams, /*OpenMPCaptureLevel=*/2); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTeams, + /*OpenMPCaptureLevel=*/2); - Sema::CapturedParamNameType ParamsParallel[] = { + SemaOpenMP::CapturedParamNameType ParamsParallel[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(".previous.lb.", Context.getSizeType().withConst()), @@ -4536,8 +4559,9 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { }; // Start a captured region for 'teams' or 'parallel'. Both regions have // the same implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsParallel, /*OpenMPCaptureLevel=*/3); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsParallel, + /*OpenMPCaptureLevel=*/3); break; } @@ -4548,16 +4572,17 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { QualType KmpInt32PtrTy = Context.getPointerType(KmpInt32Ty).withConst().withRestrict(); - Sema::CapturedParamNameType ParamsTeams[] = { + SemaOpenMP::CapturedParamNameType ParamsTeams[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; // Start a captured region for 'target' with no implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTeams, /*OpenMPCaptureLevel=*/0); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTeams, + /*OpenMPCaptureLevel=*/0); - Sema::CapturedParamNameType ParamsParallel[] = { + SemaOpenMP::CapturedParamNameType ParamsParallel[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(".previous.lb.", Context.getSizeType().withConst()), @@ -4566,8 +4591,9 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { }; // Start a captured region for 'teams' or 'parallel'. Both regions have // the same implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsParallel, /*OpenMPCaptureLevel=*/1); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsParallel, + /*OpenMPCaptureLevel=*/1); break; } case OMPD_target_update: @@ -4581,7 +4607,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4591,11 +4617,11 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".task_t.", Context.VoidPtrTy.withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); // Mark this captured region as inlined, because we don't use outlined // function directly. 
- getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); break; @@ -4626,15 +4652,15 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { default: llvm_unreachable("Unknown OpenMP directive"); } - DSAStack->setContext(CurContext); + DSAStack->setContext(SemaRef.CurContext); handleDeclareVariantConstructTrait(DSAStack, DKind, /* ScopeEntry */ true); } -int Sema::getNumberOfConstructScopes(unsigned Level) const { +int SemaOpenMP::getNumberOfConstructScopes(unsigned Level) const { return getOpenMPCaptureLevels(DSAStack->getDirective(Level)); } -int Sema::getOpenMPCaptureLevels(OpenMPDirectiveKind DKind) { +int SemaOpenMP::getOpenMPCaptureLevels(OpenMPDirectiveKind DKind) { SmallVector CaptureRegions; getOpenMPCaptureRegions(CaptureRegions, DKind); return CaptureRegions.size(); @@ -4674,7 +4700,7 @@ static OMPCapturedExprDecl *buildCaptureDecl(Sema &S, IdentifierInfo *Id, static DeclRefExpr *buildCapture(Sema &S, ValueDecl *D, Expr *CaptureExpr, bool WithInit) { OMPCapturedExprDecl *CD; - if (VarDecl *VD = S.isOpenMPCapturedDecl(D)) + if (VarDecl *VD = S.OpenMP().isOpenMPCapturedDecl(D)) CD = cast(VD); else CD = buildCaptureDecl(S, D->getIdentifier(), CaptureExpr, WithInit, @@ -4726,7 +4752,7 @@ class CaptureRegionUnwinderRAII { : S(S), ErrorFound(ErrorFound), DKind(DKind) {} ~CaptureRegionUnwinderRAII() { if (ErrorFound) { - int ThisCaptureLevel = S.getOpenMPCaptureLevels(DKind); + int ThisCaptureLevel = S.OpenMP().getOpenMPCaptureLevels(DKind); while (--ThisCaptureLevel >= 0) S.ActOnCapturedRegionError(); } @@ -4734,10 +4760,10 @@ class CaptureRegionUnwinderRAII { }; } // namespace -void Sema::tryCaptureOpenMPLambdas(ValueDecl *V) { +void SemaOpenMP::tryCaptureOpenMPLambdas(ValueDecl *V) { // Capture variables captured by reference in lambdas for target-based // directives. 
- if (!CurContext->isDependentContext() && + if (!SemaRef.CurContext->isDependentContext() && (isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective()) || isOpenMPTargetDataManagementDirective( DSAStack->getCurrentDirective()))) { @@ -4757,14 +4783,14 @@ void Sema::tryCaptureOpenMPLambdas(ValueDecl *V) { if (LC.getCaptureKind() == LCK_ByRef) { VarDecl *VD = cast(LC.getCapturedVar()); DeclContext *VDC = VD->getDeclContext(); - if (!VDC->Encloses(CurContext)) + if (!VDC->Encloses(SemaRef.CurContext)) continue; - MarkVariableReferenced(LC.getLocation(), VD); + SemaRef.MarkVariableReferenced(LC.getLocation(), VD); } else if (LC.getCaptureKind() == LCK_This) { - QualType ThisTy = getCurrentThisType(); - if (!ThisTy.isNull() && - Context.typesAreCompatible(ThisTy, ThisCapture->getType())) - CheckCXXThisCapture(LC.getLocation()); + QualType ThisTy = SemaRef.getCurrentThisType(); + if (!ThisTy.isNull() && getASTContext().typesAreCompatible( + ThisTy, ThisCapture->getType())) + SemaRef.CheckCXXThisCapture(LC.getLocation()); } } } @@ -4804,8 +4830,8 @@ static bool checkOrderedOrderSpecified(Sema &S, return false; } -StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, - ArrayRef Clauses) { +StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S, + ArrayRef Clauses) { handleDeclareVariantConstructTrait(DSAStack, DSAStack->getCurrentDirective(), /* ScopeEntry */ false); if (DSAStack->getCurrentDirective() == OMPD_atomic || @@ -4817,7 +4843,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, bool ErrorFound = false; CaptureRegionUnwinderRAII CaptureRegionUnwinder( - *this, ErrorFound, DSAStack->getCurrentDirective()); + SemaRef, ErrorFound, DSAStack->getCurrentDirective()); if (!S.isUsable()) { ErrorFound = true; return StmtError(); @@ -4831,7 +4857,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, SmallVector PICs; // This is required for proper codegen. for (OMPClause *Clause : Clauses) { - if (!LangOpts.OpenMPSimd && + if (!getLangOpts().OpenMPSimd && (isOpenMPTaskingDirective(DSAStack->getCurrentDirective()) || DSAStack->getCurrentDirective() == OMPD_target) && Clause->getClauseKind() == OMPC_in_reduction) { @@ -4840,7 +4866,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, auto *IRC = cast(Clause); for (Expr *E : IRC->taskgroup_descriptors()) if (E) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } if (isOpenMPPrivate(Clause->getClauseKind()) || Clause->getClauseKind() == OMPC_copyprivate || @@ -4851,7 +4877,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, // Mark all variables in private list clauses as used in inner region. for (Stmt *VarRef : Clause->children()) { if (auto *E = cast_or_null(VarRef)) { - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } } DSAStack->setForceVarCapturing(/*V=*/false); @@ -4865,7 +4891,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, PICs.push_back(C); if (auto *C = OMPClauseWithPostUpdate::get(Clause)) { if (Expr *E = C->getPostUpdateExpr()) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } } if (Clause->getClauseKind() == OMPC_schedule) @@ -4877,7 +4903,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, } // Capture allocator expressions if used. 
for (Expr *E : DSAStack->getInnerAllocators()) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); // OpenMP, 2.7.1 Loop Construct, Restrictions // The nonmonotonic modifier cannot be specified if an ordered clause is // specified. @@ -4899,7 +4925,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, // OpenMP 5.0, 2.9.2 Worksharing-Loop Construct, Restrictions. // If an order(concurrent) clause is present, an ordered clause may not appear // on the same directive. - if (checkOrderedOrderSpecified(*this, Clauses)) + if (checkOrderedOrderSpecified(SemaRef, Clauses)) ErrorFound = true; if (!LCs.empty() && OC && OC->getNumForLoops()) { for (const OMPLinearClause *C : LCs) { @@ -4936,7 +4962,8 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, CaptureRegion == OMPD_unknown) { if (auto *DS = cast_or_null(C->getPreInitStmt())) { for (Decl *D : DS->decls()) - MarkVariableReferenced(D->getLocation(), cast(D)); + SemaRef.MarkVariableReferenced(D->getLocation(), + cast(D)); } } } @@ -4950,7 +4977,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, ++I) { OMPUsesAllocatorsClause::Data D = UAC->getAllocatorData(I); if (Expr *E = D.AllocatorTraits) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } continue; } @@ -4965,17 +4992,17 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, continue; for (Expr *E : RC->copy_array_temps()) if (E) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } if (auto *AC = dyn_cast(C)) { for (Expr *E : AC->varlists()) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } } } if (++CompletedRegions == CaptureRegions.size()) DSAStack->setBodyComplete(); - SR = ActOnCapturedRegionEnd(SR.get()); + SR = SemaRef.ActOnCapturedRegionEnd(SR.get()); } return SR; } @@ -5782,9 +5809,9 @@ static CapturedStmt *buildLoopVarFunc(Sema &Actions, QualType LoopVarTy, // the OpenMPIRBuilder to know additional C/C++ semantics, such as how to // invoke a copy constructor. 
QualType TargetParamTy = Ctx.getLValueReferenceType(LoopVarTy); - Sema::CapturedParamNameType Params[] = {{"LoopVar", TargetParamTy}, - {"Logical", LogicalTy}, - {StringRef(), QualType()}}; + SemaOpenMP::CapturedParamNameType Params[] = {{"LoopVar", TargetParamTy}, + {"Logical", LogicalTy}, + {StringRef(), QualType()}}; Actions.ActOnCapturedRegionStart({}, nullptr, CR_Default, Params); // Capture the initial iterator which represents the LoopVar value at the @@ -5835,7 +5862,7 @@ static CapturedStmt *buildLoopVarFunc(Sema &Actions, QualType LoopVarTy, AssertSuccess(Actions.ActOnCapturedRegionEnd(Body))); } -StmtResult Sema::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { +StmtResult SemaOpenMP::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { ASTContext &Ctx = getASTContext(); // Extract the common elements of ForStmt and CXXForRangeStmt: @@ -5946,8 +5973,8 @@ StmtResult Sema::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { if (IncBin->getOpcode() == BO_AddAssign) { Step = IncBin->getRHS(); } else if (IncBin->getOpcode() == BO_SubAssign) { - Step = - AssertSuccess(BuildUnaryOp(nullptr, {}, UO_Minus, IncBin->getRHS())); + Step = AssertSuccess( + SemaRef.BuildUnaryOp(nullptr, {}, UO_Minus, IncBin->getRHS())); } else llvm_unreachable("unhandled binary increment operator"); } else if (auto *CondCXXOp = dyn_cast(Inc)) { @@ -5965,7 +5992,7 @@ StmtResult Sema::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { break; case OO_MinusEqual: Step = AssertSuccess( - BuildUnaryOp(nullptr, {}, UO_Minus, CondCXXOp->getArg(1))); + SemaRef.BuildUnaryOp(nullptr, {}, UO_Minus, CondCXXOp->getArg(1))); break; default: llvm_unreachable("unhandled overloaded increment operator"); @@ -5974,16 +6001,17 @@ StmtResult Sema::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { llvm_unreachable("unknown increment expression"); CapturedStmt *DistanceFunc = - buildDistanceFunc(*this, LogicalTy, CondRel, LHS, RHS, Step); + buildDistanceFunc(SemaRef, LogicalTy, CondRel, LHS, RHS, Step); CapturedStmt *LoopVarFunc = buildLoopVarFunc( - *this, LVTy, LogicalTy, CounterRef, Step, isa(AStmt)); - DeclRefExpr *LVRef = BuildDeclRefExpr(LUVDecl, LUVDecl->getType(), VK_LValue, - {}, nullptr, nullptr, {}, nullptr); + SemaRef, LVTy, LogicalTy, CounterRef, Step, isa(AStmt)); + DeclRefExpr *LVRef = + SemaRef.BuildDeclRefExpr(LUVDecl, LUVDecl->getType(), VK_LValue, {}, + nullptr, nullptr, {}, nullptr); return OMPCanonicalLoop::create(getASTContext(), AStmt, DistanceFunc, LoopVarFunc, LVRef); } -StmtResult Sema::ActOnOpenMPLoopnest(Stmt *AStmt) { +StmtResult SemaOpenMP::ActOnOpenMPLoopnest(Stmt *AStmt) { // Handle a literal loop. 
if (isa(AStmt) || isa(AStmt)) return ActOnOpenMPCanonicalLoop(AStmt); @@ -6128,7 +6156,7 @@ processImplicitMapsWithDefaultMappers(Sema &S, DSAStackTy *Stack, continue; CXXScopeSpec MapperIdScopeSpec; DeclarationNameInfo MapperId; - if (OMPClause *NewClause = S.ActOnOpenMPMapClause( + if (OMPClause *NewClause = S.OpenMP().ActOnOpenMPMapClause( nullptr, C->getMapTypeModifiers(), C->getMapTypeModifiersLoc(), MapperIdScopeSpec, MapperId, C->getMapType(), /*IsMapTypeImplicit=*/true, SourceLocation(), SourceLocation(), @@ -6210,14 +6238,12 @@ static bool teamsLoopCanBeParallelFor(Stmt *AStmt, Sema &SemaRef) { return Checker.teamsLoopCanBeParallelFor(); } -bool Sema::mapLoopConstruct(llvm::SmallVector &ClausesWithoutBind, - ArrayRef Clauses, - OpenMPBindClauseKind &BindKind, - OpenMPDirectiveKind &Kind, - OpenMPDirectiveKind &PrevMappedDirective, - SourceLocation StartLoc, SourceLocation EndLoc, - const DeclarationNameInfo &DirName, - OpenMPDirectiveKind CancelRegion) { +bool SemaOpenMP::mapLoopConstruct( + llvm::SmallVector &ClausesWithoutBind, + ArrayRef Clauses, OpenMPBindClauseKind &BindKind, + OpenMPDirectiveKind &Kind, OpenMPDirectiveKind &PrevMappedDirective, + SourceLocation StartLoc, SourceLocation EndLoc, + const DeclarationNameInfo &DirName, OpenMPDirectiveKind CancelRegion) { bool UseClausesWithoutBind = false; @@ -6299,7 +6325,7 @@ bool Sema::mapLoopConstruct(llvm::SmallVector &ClausesWithoutBind, return UseClausesWithoutBind; } -StmtResult Sema::ActOnOpenMPExecutableDirective( +StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName, OpenMPDirectiveKind CancelRegion, ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, @@ -6324,8 +6350,8 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( } // First check CancelRegion which is then used in checkNestingOfRegions. - if (checkCancelRegion(*this, Kind, CancelRegion, StartLoc) || - checkNestingOfRegions(*this, DSAStack, DK, DirName, CancelRegion, + if (checkCancelRegion(SemaRef, Kind, CancelRegion, StartLoc) || + checkNestingOfRegions(SemaRef, DSAStack, DK, DirName, CancelRegion, BindKind, StartLoc)) { return StmtError(); } @@ -6344,13 +6370,14 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( } else { ClausesWithImplicit.append(Clauses.begin(), Clauses.end()); } - if (AStmt && !CurContext->isDependentContext() && Kind != OMPD_atomic && - Kind != OMPD_critical && Kind != OMPD_section && Kind != OMPD_master && - Kind != OMPD_masked && !isOpenMPLoopTransformationDirective(Kind)) { + if (AStmt && !SemaRef.CurContext->isDependentContext() && + Kind != OMPD_atomic && Kind != OMPD_critical && Kind != OMPD_section && + Kind != OMPD_master && Kind != OMPD_masked && + !isOpenMPLoopTransformationDirective(Kind)) { assert(isa(AStmt) && "Captured statement expected"); // Check default data sharing attributes for referenced variables. - DSAAttrChecker DSAChecker(DSAStack, *this, cast(AStmt)); + DSAAttrChecker DSAChecker(DSAStack, SemaRef, cast(AStmt)); int ThisCaptureLevel = getOpenMPCaptureLevels(Kind); Stmt *S = AStmt; while (--ThisCaptureLevel >= 0) @@ -6490,8 +6517,8 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( } // Build expressions for implicit maps of data members with 'default' // mappers. 
- if (LangOpts.OpenMP >= 50) - processImplicitMapsWithDefaultMappers(*this, DSAStack, + if (getLangOpts().OpenMP >= 50) + processImplicitMapsWithDefaultMappers(SemaRef, DSAStack, ClausesWithImplicit); } @@ -6505,7 +6532,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( case OMPD_simd: Res = ActOnOpenMPSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_tile: @@ -6523,7 +6550,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( case OMPD_for_simd: Res = ActOnOpenMPForSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_sections: @@ -6561,7 +6588,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_scope: @@ -6698,7 +6725,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPTaskLoopSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_taskloop); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_master_taskloop: @@ -6715,13 +6742,13 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPMasterTaskLoopSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_taskloop); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_masked_taskloop_simd: Res = ActOnOpenMPMaskedTaskLoopSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 51) { + if (getLangOpts().OpenMP >= 51) { AllowedNameModifiers.push_back(OMPD_taskloop); AllowedNameModifiers.push_back(OMPD_simd); } @@ -6735,7 +6762,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( case OMPD_parallel_masked_taskloop: Res = ActOnOpenMPParallelMaskedTaskLoopDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 51) { + if (getLangOpts().OpenMP >= 51) { AllowedNameModifiers.push_back(OMPD_taskloop); AllowedNameModifiers.push_back(OMPD_parallel); } @@ -6745,13 +6772,13 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_taskloop); AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_parallel_masked_taskloop_simd: Res = ActOnOpenMPParallelMaskedTaskLoopSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 51) { + if (getLangOpts().OpenMP >= 51) { AllowedNameModifiers.push_back(OMPD_taskloop); AllowedNameModifiers.push_back(OMPD_parallel); AllowedNameModifiers.push_back(OMPD_simd); @@ -6775,13 +6802,13 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPDistributeParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); 
AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_distribute_simd: Res = ActOnOpenMPDistributeSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_target_parallel_for_simd: @@ -6789,14 +6816,14 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_target); AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_target_simd: Res = ActOnOpenMPTargetSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_target); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_teams_distribute: @@ -6806,14 +6833,14 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( case OMPD_teams_distribute_simd: Res = ActOnOpenMPTeamsDistributeSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_teams_distribute_parallel_for_simd: Res = ActOnOpenMPTeamsDistributeParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_teams_distribute_parallel_for: @@ -6842,14 +6869,14 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_target); AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_target_teams_distribute_simd: Res = ActOnOpenMPTargetTeamsDistributeSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_target); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_interop: @@ -6906,7 +6933,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( if (DSAStack->getDefaultDSA() == DSA_none || DSAStack->getDefaultDSA() == DSA_private || DSAStack->getDefaultDSA() == DSA_firstprivate) { - DSAAttrChecker DSAChecker(DSAStack, *this, nullptr); + DSAAttrChecker DSAChecker(DSAStack, SemaRef, nullptr); for (OMPClause *C : Clauses) { switch (C->getClauseKind()) { case OMPC_num_threads: @@ -7043,13 +7070,13 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( } if (!AllowedNameModifiers.empty()) - ErrorFound = checkIfClauses(*this, Kind, Clauses, AllowedNameModifiers) || + ErrorFound = checkIfClauses(SemaRef, Kind, Clauses, AllowedNameModifiers) || ErrorFound; if (ErrorFound) return StmtError(); - if (!CurContext->isDependentContext() && + if (!SemaRef.CurContext->isDependentContext() && isOpenMPTargetExecutionDirective(Kind) && !(DSAStack->hasRequiresDeclWithClause() || DSAStack->hasRequiresDeclWithClause() || @@ -7062,7 +7089,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( return Res; } -Sema::DeclGroupPtrTy 
Sema::ActOnOpenMPDeclareSimdDirective( +SemaOpenMP::DeclGroupPtrTy SemaOpenMP::ActOnOpenMPDeclareSimdDirective( DeclGroupPtrTy DG, OMPDeclareSimdDeclAttr::BranchStateTy BS, Expr *Simdlen, ArrayRef Uniforms, ArrayRef Aligneds, ArrayRef Alignments, ArrayRef Linears, @@ -7297,13 +7324,15 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareSimdDirective( NewStep = PerformOpenMPImplicitIntegerConversion(Step->getExprLoc(), Step) .get(); if (NewStep) - NewStep = - VerifyIntegerConstantExpression(NewStep, /*FIXME*/ AllowFold).get(); + NewStep = SemaRef + .VerifyIntegerConstantExpression( + NewStep, /*FIXME*/ Sema::AllowFold) + .get(); } NewSteps.push_back(NewStep); } auto *NewAttr = OMPDeclareSimdDeclAttr::CreateImplicit( - Context, BS, SL.get(), const_cast(Uniforms.data()), + getASTContext(), BS, SL.get(), const_cast(Uniforms.data()), Uniforms.size(), const_cast(Aligneds.data()), Aligneds.size(), const_cast(NewAligns.data()), NewAligns.size(), const_cast(Linears.data()), Linears.size(), @@ -7336,7 +7365,7 @@ static void setPrototype(Sema &S, FunctionDecl *FD, FunctionDecl *FDWithProto, FD->setParams(Params); } -void Sema::ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D) { +void SemaOpenMP::ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D) { if (D->isInvalidDecl()) return; FunctionDecl *FD = nullptr; @@ -7349,7 +7378,7 @@ void Sema::ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D) { // If we are instantiating templates we do *not* apply scoped assumptions but // only global ones. We apply scoped assumption to the template definition // though. - if (!inTemplateInstantiation()) { + if (!SemaRef.inTemplateInstantiation()) { for (OMPAssumeAttr *AA : OMPAssumeScoped) FD->addAttr(AA); } @@ -7357,10 +7386,10 @@ void Sema::ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D) { FD->addAttr(AA); } -Sema::OMPDeclareVariantScope::OMPDeclareVariantScope(OMPTraitInfo &TI) +SemaOpenMP::OMPDeclareVariantScope::OMPDeclareVariantScope(OMPTraitInfo &TI) : TI(&TI), NameSuffix(TI.getMangledName()) {} -void Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( +void SemaOpenMP::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( Scope *S, Declarator &D, MultiTemplateParamsArg TemplateParamLists, SmallVectorImpl &Bases) { if (!D.getIdentifier()) @@ -7376,11 +7405,11 @@ void Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( return; const IdentifierInfo *BaseII = D.getIdentifier(); - LookupResult Lookup(*this, DeclarationName(BaseII), D.getIdentifierLoc(), - LookupOrdinaryName); - LookupParsedName(Lookup, S, &D.getCXXScopeSpec()); + LookupResult Lookup(SemaRef, DeclarationName(BaseII), D.getIdentifierLoc(), + Sema::LookupOrdinaryName); + SemaRef.LookupParsedName(Lookup, S, &D.getCXXScopeSpec()); - TypeSourceInfo *TInfo = GetTypeForDeclarator(D); + TypeSourceInfo *TInfo = SemaRef.GetTypeForDeclarator(D); QualType FType = TInfo->getType(); bool IsConstexpr = @@ -7409,7 +7438,7 @@ void Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( QualType UDeclTy = UDecl->getType(); if (!UDeclTy->isDependentType()) { - QualType NewType = Context.mergeFunctionTypes( + QualType NewType = getASTContext().mergeFunctionTypes( FType, UDeclTy, /* OfBlockPointer */ false, /* Unqualified */ false, /* AllowCXX */ true); if (NewType.isNull()) @@ -7425,7 +7454,7 @@ void Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( // If no base was found we create a declaration that we use as base. 
  if (Bases.empty() && UseImplicitBase) {
    D.setFunctionDefinitionKind(FunctionDefinitionKind::Declaration);
-    Decl *BaseD = HandleDeclarator(S, D, TemplateParamLists);
+    Decl *BaseD = SemaRef.HandleDeclarator(S, D, TemplateParamLists);
    BaseD->setImplicit(true);
    if (auto *BaseTemplD = dyn_cast(BaseD))
      Bases.push_back(BaseTemplD->getTemplatedDecl());
@@ -7437,18 +7466,18 @@ void Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(
  MangledName += D.getIdentifier()->getName();
  MangledName += getOpenMPVariantManglingSeparatorStr();
  MangledName += DVScope.NameSuffix;
-  IdentifierInfo &VariantII = Context.Idents.get(MangledName);
+  IdentifierInfo &VariantII = getASTContext().Idents.get(MangledName);
  VariantII.setMangledOpenMPVariantName(true);
  D.SetIdentifier(&VariantII, D.getBeginLoc());
}

-void Sema::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
+void SemaOpenMP::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
    Decl *D, SmallVectorImpl &Bases) {
  // Do not mark the function as used, to prevent its emission if this is the
  // only place where it is used.
  EnterExpressionEvaluationContext Unevaluated(
-      *this, Sema::ExpressionEvaluationContext::Unevaluated);
+      SemaRef, Sema::ExpressionEvaluationContext::Unevaluated);
  FunctionDecl *FD = nullptr;
  if (auto *UTemplDecl = dyn_cast(D))
@@ -7456,14 +7485,14 @@ void Sema::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
  else
    FD = cast(D);
  auto *VariantFuncRef = DeclRefExpr::Create(
-      Context, NestedNameSpecifierLoc(), SourceLocation(), FD,
+      getASTContext(), NestedNameSpecifierLoc(), SourceLocation(), FD,
      /* RefersToEnclosingVariableOrCapture */ false,
      /* NameLoc */ FD->getLocation(), FD->getType(),
      ExprValueKind::VK_PRValue);

  OMPDeclareVariantScope &DVScope = OMPDeclareVariantScopes.back();
  auto *OMPDeclareVariantA = OMPDeclareVariantAttr::CreateImplicit(
-      Context, VariantFuncRef, DVScope.TI,
+      getASTContext(), VariantFuncRef, DVScope.TI,
      /*NothingArgs=*/nullptr, /*NothingArgsSize=*/0,
      /*NeedDevicePtrArgs=*/nullptr, /*NeedDevicePtrArgsSize=*/0,
      /*AppendArgs=*/nullptr, /*AppendArgsSize=*/0);
@@ -7471,10 +7500,11 @@
    BaseFD->addAttr(OMPDeclareVariantA);
}

-ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope,
-                                 SourceLocation LParenLoc,
-                                 MultiExprArg ArgExprs,
-                                 SourceLocation RParenLoc, Expr *ExecConfig) {
+ExprResult SemaOpenMP::ActOnOpenMPCall(ExprResult Call, Scope *Scope,
+                                       SourceLocation LParenLoc,
+                                       MultiExprArg ArgExprs,
+                                       SourceLocation RParenLoc,
+                                       Expr *ExecConfig) {
  // The common case is a regular call we do not want to specialize at all. Try
  // to make that case fast by bailing early.
CallExpr *CE = dyn_cast(Call.get()); @@ -7485,7 +7515,7 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, if (!CalleeFnDecl) return Call; - if (LangOpts.OpenMP >= 51 && CalleeFnDecl->getIdentifier() && + if (getLangOpts().OpenMP >= 51 && CalleeFnDecl->getIdentifier() && CalleeFnDecl->getName().starts_with_insensitive("omp_")) { // checking for any calls inside an Order region if (Scope && Scope->isOpenMPOrderClauseScope()) @@ -7504,7 +7534,8 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, << ISATrait; }; TargetOMPContext OMPCtx(Context, std::move(DiagUnknownTrait), - getCurFunctionDecl(), DSAStack->getConstructTraits()); + SemaRef.getCurFunctionDecl(), + DSAStack->getConstructTraits()); QualType CalleeFnType = CalleeFnDecl->getType(); @@ -7549,7 +7580,7 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, // different type than the base function. This is intended and OK but if // we cannot create a call the difference is not in the "implementation // defined range" we allow. - Sema::TentativeAnalysisScope Trap(*this); + Sema::TentativeAnalysisScope Trap(SemaRef); if (auto *SpecializedMethod = dyn_cast(BestDecl)) { auto *MemberCall = dyn_cast(CE); @@ -7558,12 +7589,12 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, /* IsArrow */ false, SpecializedMethod, Context.BoundMemberTy, MemberCall->getValueKind(), MemberCall->getObjectKind()); } - NewCall = BuildCallExpr(Scope, BestExpr, LParenLoc, ArgExprs, RParenLoc, - ExecConfig); + NewCall = SemaRef.BuildCallExpr(Scope, BestExpr, LParenLoc, ArgExprs, + RParenLoc, ExecConfig); if (NewCall.isUsable()) { if (CallExpr *NCE = dyn_cast(NewCall.get())) { FunctionDecl *NewCalleeFnDecl = NCE->getDirectCallee(); - QualType NewType = Context.mergeFunctionTypes( + QualType NewType = getASTContext().mergeFunctionTypes( CalleeFnType, NewCalleeFnDecl->getType(), /* OfBlockPointer */ false, /* Unqualified */ false, /* AllowCXX */ true); @@ -7581,14 +7612,16 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, if (!NewCall.isUsable()) return Call; - return PseudoObjectExpr::Create(Context, CE, {NewCall.get()}, 0); + return PseudoObjectExpr::Create(getASTContext(), CE, {NewCall.get()}, 0); } std::optional> -Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, - Expr *VariantRef, OMPTraitInfo &TI, - unsigned NumAppendArgs, - SourceRange SR) { +SemaOpenMP::checkOpenMPDeclareVariantFunction(SemaOpenMP::DeclGroupPtrTy DG, + Expr *VariantRef, + OMPTraitInfo &TI, + unsigned NumAppendArgs, + SourceRange SR) { + ASTContext &Context = getASTContext(); if (!DG || DG.get().isNull()) return std::nullopt; @@ -7631,7 +7664,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, // Check if the function was emitted already. const FunctionDecl *Definition; if (!FD->isThisDeclarationADefinition() && FD->isDefined(Definition) && - (LangOpts.EmitAllDecls || Context.DeclMustBeEmitted(Definition))) + (getLangOpts().EmitAllDecls || Context.DeclMustBeEmitted(Definition))) Diag(SR.getBegin(), diag::warn_omp_declare_variant_after_emitted) << FD->getLocation(); @@ -7654,7 +7687,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, // Deal with non-constant score and user condition expressions. 
  auto HandleNonConstantScoresAndConditions = [this](Expr *&E,
                                                     bool IsScore) -> bool {
-    if (!E || E->isIntegerConstantExpr(Context))
+    if (!E || E->isIntegerConstantExpr(getASTContext()))
      return false;
    if (IsScore) {
@@ -7686,9 +7719,9 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
  // Adjust the function type to account for an extra omp_interop_t for each
  // specified in the append_args clause.
  const TypeDecl *TD = nullptr;
-  LookupResult Result(*this, &Context.Idents.get("omp_interop_t"),
+  LookupResult Result(SemaRef, &Context.Idents.get("omp_interop_t"),
                      SR.getBegin(), Sema::LookupOrdinaryName);
-  if (LookupName(Result, getCurScope())) {
+  if (SemaRef.LookupName(Result, SemaRef.getCurScope())) {
    NamedDecl *ND = Result.getFoundDecl();
    TD = dyn_cast_or_null(ND);
  }
@@ -7711,7 +7744,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
  // Convert VariantRef expression to the type of the original function to
  // resolve possible conflicts.
  ExprResult VariantRefCast = VariantRef;
-  if (LangOpts.CPlusPlus) {
+  if (getLangOpts().CPlusPlus) {
    QualType FnPtrType;
    auto *Method = dyn_cast(FD);
    if (Method && !Method->isStatic()) {
@@ -7722,9 +7755,9 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
      {
        // Build addr_of unary op to correctly handle type checks for member
        // functions.
-        Sema::TentativeAnalysisScope Trap(*this);
-        ER = CreateBuiltinUnaryOp(VariantRef->getBeginLoc(), UO_AddrOf,
-                                  VariantRef);
+        Sema::TentativeAnalysisScope Trap(SemaRef);
+        ER = SemaRef.CreateBuiltinUnaryOp(VariantRef->getBeginLoc(), UO_AddrOf,
+                                          VariantRef);
      }
      if (!ER.isUsable()) {
        Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected)
@@ -7737,9 +7770,9 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
    }
    QualType VarianPtrType = Context.getPointerType(VariantRef->getType());
    if (VarianPtrType.getUnqualifiedType() != FnPtrType.getUnqualifiedType()) {
-      ImplicitConversionSequence ICS = TryImplicitConversion(
+      ImplicitConversionSequence ICS = SemaRef.TryImplicitConversion(
          VariantRef, FnPtrType.getUnqualifiedType(),
-          /*SuppressUserConversions=*/false, AllowedExplicit::None,
+          /*SuppressUserConversions=*/false, Sema::AllowedExplicit::None,
          /*InOverloadResolution=*/false,
          /*CStyle=*/false,
          /*AllowObjCWritebackConversion=*/false);
@@ -7751,8 +7784,8 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
            << (NumAppendArgs ? 1 : 0) << VariantRef->getSourceRange();
        return std::nullopt;
      }
-      VariantRefCast = PerformImplicitConversion(
-          VariantRef, FnPtrType.getUnqualifiedType(), AA_Converting);
+      VariantRefCast = SemaRef.PerformImplicitConversion(
+          VariantRef, FnPtrType.getUnqualifiedType(), Sema::AA_Converting);
      if (!VariantRefCast.isUsable())
        return std::nullopt;
    }
@@ -7765,7 +7798,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
    }
  }

-  ExprResult ER = CheckPlaceholderExpr(VariantRefCast.get());
+  ExprResult ER = SemaRef.CheckPlaceholderExpr(VariantRefCast.get());
  if (!ER.isUsable() ||
      !ER.get()->IgnoreParenImpCasts()->getType()->isFunctionType()) {
    Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected)
@@ -7795,7 +7828,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG,
  }

  // Check if function types are compatible in C.
- if (!LangOpts.CPlusPlus) { + if (!getLangOpts().CPlusPlus) { QualType NewType = Context.mergeFunctionTypes(AdjustedFnType, NewFD->getType()); if (NewType.isNull()) { @@ -7807,9 +7840,9 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, } if (NewType->isFunctionProtoType()) { if (FD->getType()->isFunctionNoProtoType()) - setPrototype(*this, FD, NewFD, NewType); + setPrototype(SemaRef, FD, NewFD, NewType); else if (NewFD->getType()->isFunctionNoProtoType()) - setPrototype(*this, NewFD, FD, NewType); + setPrototype(SemaRef, NewFD, FD, NewType); } } @@ -7872,15 +7905,15 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, } // Check general compatibility. - if (areMultiversionVariantFunctionsCompatible( + if (SemaRef.areMultiversionVariantFunctionsCompatible( FD, NewFD, PartialDiagnostic::NullDiagnostic(), PartialDiagnosticAt(SourceLocation(), PartialDiagnostic::NullDiagnostic()), PartialDiagnosticAt( VariantRef->getExprLoc(), - PDiag(diag::err_omp_declare_variant_doesnt_support)), + SemaRef.PDiag(diag::err_omp_declare_variant_doesnt_support)), PartialDiagnosticAt(VariantRef->getExprLoc(), - PDiag(diag::err_omp_declare_variant_diff) + SemaRef.PDiag(diag::err_omp_declare_variant_diff) << FD->getLocation()), /*TemplatesSupported=*/true, /*ConstexprSupported=*/false, /*CLinkageMayDiffer=*/true)) @@ -7888,7 +7921,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, return std::make_pair(FD, cast(DRE)); } -void Sema::ActOnOpenMPDeclareVariantDirective( +void SemaOpenMP::ActOnOpenMPDeclareVariantDirective( FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI, ArrayRef AdjustArgsNothing, ArrayRef AdjustArgsNeedDevicePtr, @@ -7906,7 +7939,7 @@ void Sema::ActOnOpenMPDeclareVariantDirective( if (!AllAdjustArgs.empty() || !AppendArgs.empty()) { VariantMatchInfo VMI; - TI.getAsVariantMatchInfo(Context, VMI); + TI.getAsVariantMatchInfo(getASTContext(), VMI); if (!llvm::is_contained( VMI.ConstructTraits, llvm::omp::TraitProperty::construct_dispatch_dispatch)) { @@ -7949,18 +7982,18 @@ void Sema::ActOnOpenMPDeclareVariantDirective( } auto *NewAttr = OMPDeclareVariantAttr::CreateImplicit( - Context, VariantRef, &TI, const_cast(AdjustArgsNothing.data()), - AdjustArgsNothing.size(), + getASTContext(), VariantRef, &TI, + const_cast(AdjustArgsNothing.data()), AdjustArgsNothing.size(), const_cast(AdjustArgsNeedDevicePtr.data()), AdjustArgsNeedDevicePtr.size(), const_cast(AppendArgs.data()), AppendArgs.size(), SR); FD->addAttr(NewAttr); } -StmtResult Sema::ActOnOpenMPParallelDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPParallelDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -7972,11 +8005,11 @@ StmtResult Sema::ActOnOpenMPParallelDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. 
CS->getCapturedDecl()->setNothrow(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPParallelDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, - DSAStack->getTaskgroupReductionRef(), - DSAStack->isCancelRegion()); + return OMPParallelDirective::Create( + getASTContext(), StartLoc, EndLoc, Clauses, AStmt, + DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } namespace { @@ -8226,7 +8259,7 @@ bool OpenMPIterationSpaceChecker::setStep(Expr *NewStep, bool Subtract) { if (!NewStep->isValueDependent()) { // Check that the step is integer expression. SourceLocation StepLoc = NewStep->getBeginLoc(); - ExprResult Val = SemaRef.PerformOpenMPImplicitIntegerConversion( + ExprResult Val = SemaRef.OpenMP().PerformOpenMPImplicitIntegerConversion( StepLoc, getExprAsWritten(NewStep)); if (Val.isInvalid()) return true; @@ -9248,7 +9281,7 @@ DeclRefExpr *OpenMPIterationSpaceChecker::buildCounterVar( DSAStackTy &DSA) const { auto *VD = dyn_cast(LCDecl); if (!VD) { - VD = SemaRef.isOpenMPCapturedDecl(LCDecl); + VD = SemaRef.OpenMP().isOpenMPCapturedDecl(LCDecl); DeclRefExpr *Ref = buildDeclRefExpr( SemaRef, VD, VD->getType().getNonReferenceType(), DefaultLoc); const DSAStackTy::DSAVarData Data = @@ -9321,14 +9354,15 @@ Expr *OpenMPIterationSpaceChecker::buildOrderedLoopData( } } // namespace -void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) { +void SemaOpenMP::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, + Stmt *Init) { assert(getLangOpts().OpenMP && "OpenMP is not active."); assert(Init && "Expected loop in canonical form."); unsigned AssociatedLoops = DSAStack->getAssociatedLoops(); if (AssociatedLoops > 0 && isOpenMPLoopDirective(DSAStack->getCurrentDirective())) { DSAStack->loopStart(); - OpenMPIterationSpaceChecker ISC(*this, /*SupportsNonRectangular=*/true, + OpenMPIterationSpaceChecker ISC(SemaRef, /*SupportsNonRectangular=*/true, *DSAStack, ForLoc); if (!ISC.checkAndSetInit(Init, /*EmitDiags=*/false)) { if (ValueDecl *D = ISC.getLoopDecl()) { @@ -9338,7 +9372,7 @@ void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) { if (VarDecl *Private = isOpenMPCapturedDecl(D)) { VD = Private; } else { - PrivateRef = buildCapture(*this, D, ISC.getLoopDeclRefExpr(), + PrivateRef = buildCapture(SemaRef, D, ISC.getLoopDeclRefExpr(), /*WithInit=*/false); VD = cast(PrivateRef->getDecl()); } @@ -9348,10 +9382,10 @@ void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) { if (LD != D->getCanonicalDecl()) { DSAStack->resetPossibleLoopCounter(); if (auto *Var = dyn_cast_or_null(LD)) - MarkDeclarationsReferencedInExpr( - buildDeclRefExpr(*this, const_cast(Var), - Var->getType().getNonLValueExprType(Context), - ForLoc, /*RefersToCapture=*/true)); + SemaRef.MarkDeclarationsReferencedInExpr(buildDeclRefExpr( + SemaRef, const_cast(Var), + Var->getType().getNonLValueExprType(getASTContext()), ForLoc, + /*RefersToCapture=*/true)); } OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); // OpenMP [2.14.1.1, Data-sharing Attribute Rules for Variables @@ -9372,8 +9406,8 @@ void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) { : OMPC_private; if (((isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown && DVar.CKind != PredeterminedCKind && DVar.RefExpr && - (LangOpts.OpenMP <= 45 || (DVar.CKind != OMPC_lastprivate && - DVar.CKind != OMPC_private))) || + (getLangOpts().OpenMP <= 45 || (DVar.CKind != OMPC_lastprivate && + DVar.CKind != 
OMPC_private))) ||
          ((isOpenMPWorksharingDirective(DKind) || DKind == OMPD_taskloop ||
            DKind == OMPD_master_taskloop || DKind == OMPD_masked_taskloop ||
            DKind == OMPD_parallel_master_taskloop ||
@@ -9388,7 +9422,7 @@ void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) {
              << getOpenMPClauseName(PredeterminedCKind);
          if (DVar.RefExpr == nullptr)
            DVar.CKind = PredeterminedCKind;
-          reportOriginalDsa(*this, DSAStack, D, DVar,
+          reportOriginalDsa(SemaRef, DSAStack, D, DVar,
                            /*IsLoopIterVar=*/true);
        } else if (LoopDeclRefExpr) {
          // Make the loop iteration variable private (for worksharing
@@ -9428,7 +9462,7 @@ static bool checkOpenMPIterationSpace(
    unsigned CurrentNestedLoopCount, unsigned NestedLoopCount,
    unsigned TotalNestedLoopCount, Expr *CollapseLoopCountExpr,
    Expr *OrderedLoopCountExpr,
-    Sema::VarsWithInheritedDSAType &VarsWithImplicitDSA,
+    SemaOpenMP::VarsWithInheritedDSAType &VarsWithImplicitDSA,
    llvm::MutableArrayRef ResultIterSpaces,
    llvm::MapVector &Captures) {
  bool SupportsNonRectangular = !isOpenMPLoopTransformationDirective(DKind);
@@ -9817,7 +9851,7 @@ static unsigned checkOpenMPLoop(OpenMPDirectiveKind DKind,
                                Expr *CollapseLoopCountExpr,
                                Expr *OrderedLoopCountExpr, Stmt *AStmt,
                                Sema &SemaRef, DSAStackTy &DSA,
-                               Sema::VarsWithInheritedDSAType &VarsWithImplicitDSA,
+                               SemaOpenMP::VarsWithInheritedDSAType &VarsWithImplicitDSA,
                                OMPLoopBasedDirective::HelperExprs &Built) {
  unsigned NestedLoopCount = 1;
  bool SupportsNonPerfectlyNested = (SemaRef.LangOpts.OpenMP >= 50) &&
@@ -10566,7 +10600,7 @@ static bool checkGenericLoopLastprivate(Sema &S, ArrayRef Clauses,
                                        OpenMPDirectiveKind K,
                                        DSAStackTy *Stack);

-bool Sema::checkLastPrivateForMappedDirectives(ArrayRef Clauses) {
+bool SemaOpenMP::checkLastPrivateForMappedDirectives(
+    ArrayRef Clauses) {
  // Check the syntax of lastprivate.
  // Params of the lastprivate clause have different meanings in the mapped
  // directives.
@@ -10574,16 +10609,15 @@ bool Sema::checkLastPrivateForMappedDirectives(ArrayRef Clauses) {
  // "omp for" lastprivate vars must be shared
  if (getLangOpts().OpenMP >= 50 &&
      DSAStack->getMappedDirective() == OMPD_loop &&
-      checkGenericLoopLastprivate(*this, Clauses, OMPD_loop, DSAStack)) {
+      checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_loop, DSAStack)) {
    return false;
  }
  return true;
}

-StmtResult
-Sema::ActOnOpenMPSimdDirective(ArrayRef Clauses, Stmt *AStmt,
-                               SourceLocation StartLoc, SourceLocation EndLoc,
-                               VarsWithInheritedDSAType &VarsWithImplicitDSA) {
+StmtResult SemaOpenMP::ActOnOpenMPSimdDirective(
+    ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc,
+    SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) {
  if (!AStmt)
    return StmtError();

@@ -10596,38 +10630,37 @@ Sema::ActOnOpenMPSimdDirective(ArrayRef Clauses, Stmt *AStmt,
  // define the nested loops number.
  unsigned NestedLoopCount = checkOpenMPLoop(
      OMPD_simd, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses),
-      AStmt, *this, *DSAStack, VarsWithImplicitDSA, B);
+      AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B);
  if (NestedLoopCount == 0)
    return StmtError();

-  assert((CurContext->isDependentContext() || B.builtAll()) &&
+  assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) &&
         "omp simd loop exprs were not built");

-  if (!CurContext->isDependentContext()) {
+  if (!SemaRef.CurContext->isDependentContext()) {
    // Finalize the clauses that need pre-built expressions for CodeGen.
for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); auto *SimdDirective = OMPSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getMappedDirective()); return SimdDirective; } -StmtResult -Sema::ActOnOpenMPForDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA) { +StmtResult SemaOpenMP::ActOnOpenMPForDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) return StmtError(); @@ -10640,32 +10673,32 @@ Sema::ActOnOpenMPForDirective(ArrayRef Clauses, Stmt *AStmt, // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_for, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), - AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); + AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } auto *ForDirective = OMPForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion(), DSAStack->getMappedDirective()); return ForDirective; } -StmtResult Sema::ActOnOpenMPForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -10677,37 +10710,37 @@ StmtResult Sema::ActOnOpenMPForSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_for_simd, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + getOrderedNumberExpr(Clauses), AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for simd loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. 
for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPForSimdDirective::Create(Context, StartLoc, EndLoc, NestedLoopCount, - Clauses, AStmt, B); + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPForSimdDirective::Create(getASTContext(), StartLoc, EndLoc, + NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPSectionsDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPSectionsDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -10736,23 +10769,23 @@ StmtResult Sema::ActOnOpenMPSectionsDirective(ArrayRef Clauses, return StmtError(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPSectionsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, - DSAStack->getTaskgroupReductionRef(), - DSAStack->isCancelRegion()); + return OMPSectionsDirective::Create( + getASTContext(), StartLoc, EndLoc, Clauses, AStmt, + DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPSectionDirective(Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPSectionDirective(Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentCancelRegion(DSAStack->isCancelRegion()); - return OMPSectionDirective::Create(Context, StartLoc, EndLoc, AStmt, + return OMPSectionDirective::Create(getASTContext(), StartLoc, EndLoc, AStmt, DSAStack->isCancelRegion()); } @@ -10764,10 +10797,10 @@ static Expr *getDirectCallExpr(Expr *E) { return nullptr; } -StmtResult Sema::ActOnOpenMPDispatchDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPDispatchDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -10780,7 +10813,7 @@ StmtResult Sema::ActOnOpenMPDispatchDirective(ArrayRef Clauses, SourceLocation TargetCallLoc; - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { Expr *TargetCall = nullptr; auto *E = dyn_cast(S); @@ -10808,10 +10841,10 @@ StmtResult Sema::ActOnOpenMPDispatchDirective(ArrayRef Clauses, TargetCallLoc = TargetCall->getExprLoc(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPDispatchDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, - TargetCallLoc); + return OMPDispatchDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt, TargetCallLoc); } static bool checkGenericLoopLastprivate(Sema &S, ArrayRef Clauses, @@ -10839,7 +10872,7 @@ static bool checkGenericLoopLastprivate(Sema &S, ArrayRef Clauses, return ErrorFound; } -StmtResult Sema::ActOnOpenMPGenericLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPGenericLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, 
VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -10848,7 +10881,7 @@ StmtResult Sema::ActOnOpenMPGenericLoopDirective( // OpenMP 5.1 [2.11.7, loop construct, Restrictions] // A list item may not appear in a lastprivate clause unless it is the // loop iteration variable of a loop that is associated with the construct. - if (checkGenericLoopLastprivate(*this, Clauses, OMPD_loop, DSAStack)) + if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_loop, DSAStack)) return StmtError(); auto *CS = cast(AStmt); @@ -10863,19 +10896,19 @@ StmtResult Sema::ActOnOpenMPGenericLoopDirective( // In presence of clause 'collapse', it will define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_loop, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), - AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); + AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - setFunctionHasBranchProtectedScope(); - return OMPGenericLoopDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPGenericLoopDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTeamsGenericLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPTeamsGenericLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -10884,7 +10917,7 @@ StmtResult Sema::ActOnOpenMPTeamsGenericLoopDirective( // OpenMP 5.1 [2.11.7, loop construct, Restrictions] // A list item may not appear in a lastprivate clause unless it is the // loop iteration variable of a loop that is associated with the construct. - if (checkGenericLoopLastprivate(*this, Clauses, OMPD_teams_loop, DSAStack)) + if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_teams_loop, DSAStack)) return StmtError(); auto *CS = cast(AStmt); @@ -10909,22 +10942,22 @@ StmtResult Sema::ActOnOpenMPTeamsGenericLoopDirective( // In presence of clause 'collapse', it will define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_teams_loop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsGenericLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetTeamsGenericLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsGenericLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -10933,7 +10966,7 @@ StmtResult Sema::ActOnOpenMPTargetTeamsGenericLoopDirective( // OpenMP 5.1 [2.11.7, loop construct, Restrictions] // A list item may not appear in a lastprivate clause unless it is the // loop iteration variable of a loop that is associated with the construct. - if (checkGenericLoopLastprivate(*this, Clauses, OMPD_target_teams_loop, + if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_target_teams_loop, DSAStack)) return StmtError(); @@ -10959,22 +10992,22 @@ StmtResult Sema::ActOnOpenMPTargetTeamsGenericLoopDirective( // In presence of clause 'collapse', it will define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_target_teams_loop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsGenericLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, - teamsLoopCanBeParallelFor(AStmt, *this)); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + teamsLoopCanBeParallelFor(AStmt, SemaRef)); } -StmtResult Sema::ActOnOpenMPParallelGenericLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelGenericLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -10983,7 +11016,8 @@ StmtResult Sema::ActOnOpenMPParallelGenericLoopDirective( // OpenMP 5.1 [2.11.7, loop construct, Restrictions] // A list item may not appear in a lastprivate clause unless it is the // loop iteration variable of a loop that is associated with the construct. - if (checkGenericLoopLastprivate(*this, Clauses, OMPD_parallel_loop, DSAStack)) + if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_parallel_loop, + DSAStack)) return StmtError(); auto *CS = cast(AStmt); @@ -11008,21 +11042,21 @@ StmtResult Sema::ActOnOpenMPParallelGenericLoopDirective( // In presence of clause 'collapse', it will define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_parallel_loop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelGenericLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetParallelGenericLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetParallelGenericLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -11031,7 +11065,7 @@ StmtResult Sema::ActOnOpenMPTargetParallelGenericLoopDirective( // OpenMP 5.1 [2.11.7, loop construct, Restrictions] // A list item may not appear in a lastprivate clause unless it is the // loop iteration variable of a loop that is associated with the construct. - if (checkGenericLoopLastprivate(*this, Clauses, OMPD_target_parallel_loop, + if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_target_parallel_loop, DSAStack)) return StmtError(); @@ -11057,30 +11091,30 @@ StmtResult Sema::ActOnOpenMPTargetParallelGenericLoopDirective( // In presence of clause 'collapse', it will define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_target_parallel_loop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetParallelGenericLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPSingleDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPSingleDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); assert(isa(AStmt) && "Captured statement expected"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); // OpenMP [2.7.3, single Construct, Restrictions] // The copyprivate clause must not be used with the nowait clause. 
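Illustrative aside, not part of the patch: the OpenMP [2.7.3] restriction noted in the comment above (copyprivate must not be combined with nowait on a single construct) corresponds to user code like the following minimal sketch, where compute() is a placeholder:

    int compute(void);

    void broadcast_example(void) {
    #pragma omp parallel
      {
        int result; // declared inside the parallel region, so private per thread
        // One thread executes the block; copyprivate then broadcasts its
        // private 'result' to the other threads' copies at the implicit barrier.
    #pragma omp single copyprivate(result)
        result = compute();
        // Adding 'nowait' to the single directive above would be diagnosed by
        // the check this comment introduces.
      }
    }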
@@ -11099,33 +11133,35 @@ StmtResult Sema::ActOnOpenMPSingleDirective(ArrayRef Clauses, } } - return OMPSingleDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); + return OMPSingleDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt); } -StmtResult Sema::ActOnOpenMPMasterDirective(Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPMasterDirective(Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPMasterDirective::Create(Context, StartLoc, EndLoc, AStmt); + return OMPMasterDirective::Create(getASTContext(), StartLoc, EndLoc, AStmt); } -StmtResult Sema::ActOnOpenMPMaskedDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPMaskedDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPMaskedDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); + return OMPMaskedDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt); } -StmtResult Sema::ActOnOpenMPCriticalDirective( +StmtResult SemaOpenMP::ActOnOpenMPCriticalDirective( const DeclarationNameInfo &DirName, ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc) { if (!AStmt) @@ -11146,7 +11182,7 @@ StmtResult Sema::ActOnOpenMPCriticalDirective( E->isInstantiationDependent()) { DependentHint = true; } else { - Hint = E->EvaluateKnownConstInt(Context); + Hint = E->EvaluateKnownConstInt(getASTContext()); HintLoc = C->getBeginLoc(); } } @@ -11165,7 +11201,7 @@ StmtResult Sema::ActOnOpenMPCriticalDirective( if (const auto *C = Pair.first->getSingleClause()) { Diag(C->getBeginLoc(), diag::note_omp_critical_hint_here) << 1 - << toString(C->getHint()->EvaluateKnownConstInt(Context), + << toString(C->getHint()->EvaluateKnownConstInt(getASTContext()), /*Radix=*/10, /*Signed=*/false); } else { Diag(Pair.first->getBeginLoc(), diag::note_omp_critical_no_hint) << 1; @@ -11173,16 +11209,16 @@ StmtResult Sema::ActOnOpenMPCriticalDirective( } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - auto *Dir = OMPCriticalDirective::Create(Context, DirName, StartLoc, EndLoc, - Clauses, AStmt); + auto *Dir = OMPCriticalDirective::Create(getASTContext(), DirName, StartLoc, + EndLoc, Clauses, AStmt); if (!Pair.first && DirName.getName() && !DependentHint) DSAStack->addCriticalWithHint(Dir, Hint); return Dir; } -StmtResult Sema::ActOnOpenMPParallelForDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelForDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -11201,32 +11237,32 @@ StmtResult Sema::ActOnOpenMPParallelForDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_parallel_for, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + getOrderedNumberExpr(Clauses), AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp parallel for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPParallelForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -11245,34 +11281,33 @@ StmtResult Sema::ActOnOpenMPParallelForSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_parallel_for_simd, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + getOrderedNumberExpr(Clauses), AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelForSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult -Sema::ActOnOpenMPParallelMasterDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPParallelMasterDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -11285,17 +11320,16 @@ Sema::ActOnOpenMPParallelMasterDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. 
CS->getCapturedDecl()->setNothrow(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMasterDirective::Create( - Context, StartLoc, EndLoc, Clauses, AStmt, + getASTContext(), StartLoc, EndLoc, Clauses, AStmt, DSAStack->getTaskgroupReductionRef()); } -StmtResult -Sema::ActOnOpenMPParallelMaskedDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -11308,17 +11342,16 @@ Sema::ActOnOpenMPParallelMaskedDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. CS->getCapturedDecl()->setNothrow(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMaskedDirective::Create( - Context, StartLoc, EndLoc, Clauses, AStmt, + getASTContext(), StartLoc, EndLoc, Clauses, AStmt, DSAStack->getTaskgroupReductionRef()); } -StmtResult -Sema::ActOnOpenMPParallelSectionsDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPParallelSectionsDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -11348,10 +11381,10 @@ Sema::ActOnOpenMPParallelSectionsDirective(ArrayRef Clauses, return StmtError(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelSectionsDirective::Create( - Context, StartLoc, EndLoc, Clauses, AStmt, + getASTContext(), StartLoc, EndLoc, Clauses, AStmt, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } @@ -11378,16 +11411,17 @@ static bool checkMutuallyExclusiveClauses( return ErrorFound; } -StmtResult Sema::ActOnOpenMPTaskDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPTaskDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); // OpenMP 5.0, 2.10.1 task Construct // If a detach clause appears on the directive, then a mergeable clause cannot // appear on the same directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_detach, OMPC_mergeable})) return StmtError(); @@ -11399,26 +11433,26 @@ StmtResult Sema::ActOnOpenMPTaskDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. 
CS->getCapturedDecl()->setNothrow(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPTaskDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, - DSAStack->isCancelRegion()); + return OMPTaskDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt, DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPTaskyieldDirective(SourceLocation StartLoc, - SourceLocation EndLoc) { - return OMPTaskyieldDirective::Create(Context, StartLoc, EndLoc); +StmtResult SemaOpenMP::ActOnOpenMPTaskyieldDirective(SourceLocation StartLoc, + SourceLocation EndLoc) { + return OMPTaskyieldDirective::Create(getASTContext(), StartLoc, EndLoc); } -StmtResult Sema::ActOnOpenMPBarrierDirective(SourceLocation StartLoc, - SourceLocation EndLoc) { - return OMPBarrierDirective::Create(Context, StartLoc, EndLoc); +StmtResult SemaOpenMP::ActOnOpenMPBarrierDirective(SourceLocation StartLoc, + SourceLocation EndLoc) { + return OMPBarrierDirective::Create(getASTContext(), StartLoc, EndLoc); } -StmtResult Sema::ActOnOpenMPErrorDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - bool InExContext) { +StmtResult SemaOpenMP::ActOnOpenMPErrorDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc, + bool InExContext) { const OMPAtClause *AtC = OMPExecutableDirective::getSingleClause(Clauses); @@ -11443,12 +11477,13 @@ StmtResult Sema::ActOnOpenMPErrorDirective(ArrayRef Clauses, if (!SeverityC || SeverityC->getSeverityKind() != OMPC_SEVERITY_warning) return StmtError(); } - return OMPErrorDirective::Create(Context, StartLoc, EndLoc, Clauses); + return OMPErrorDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses); } -StmtResult Sema::ActOnOpenMPTaskwaitDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPTaskwaitDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc) { const OMPNowaitClause *NowaitC = OMPExecutableDirective::getSingleClause(Clauses); bool HasDependC = @@ -11459,28 +11494,29 @@ StmtResult Sema::ActOnOpenMPTaskwaitDirective(ArrayRef Clauses, return StmtError(); } - return OMPTaskwaitDirective::Create(Context, StartLoc, EndLoc, Clauses); + return OMPTaskwaitDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses); } -StmtResult Sema::ActOnOpenMPTaskgroupDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPTaskgroupDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); assert(isa(AStmt) && "Captured statement expected"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPTaskgroupDirective::Create(Context, StartLoc, EndLoc, Clauses, - AStmt, + return OMPTaskgroupDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt, DSAStack->getTaskgroupReductionRef()); } -StmtResult Sema::ActOnOpenMPFlushDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPFlushDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc) { OMPFlushClause *FC = nullptr; OMPClause *OrderClause = nullptr; for (OMPClause *C : Clauses) { @@ -11514,12 +11550,12 @@ StmtResult Sema::ActOnOpenMPFlushDirective(ArrayRef Clauses, << getOpenMPClauseName(OrderClause->getClauseKind()); return StmtError(); } - return 
OMPFlushDirective::Create(Context, StartLoc, EndLoc, Clauses); + return OMPFlushDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses); } -StmtResult Sema::ActOnOpenMPDepobjDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPDepobjDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (Clauses.empty()) { Diag(StartLoc, diag::err_omp_depobj_expected); return StmtError(); @@ -11536,12 +11572,12 @@ StmtResult Sema::ActOnOpenMPDepobjDirective(ArrayRef Clauses, Diag(Clauses[0]->getEndLoc(), diag::err_omp_depobj_single_clause_expected); return StmtError(); } - return OMPDepobjDirective::Create(Context, StartLoc, EndLoc, Clauses); + return OMPDepobjDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses); } -StmtResult Sema::ActOnOpenMPScanDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPScanDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc) { // Check that exactly one clause is specified. if (Clauses.size() != 1) { Diag(Clauses.empty() ? EndLoc : Clauses[1]->getBeginLoc(), @@ -11566,13 +11602,13 @@ StmtResult Sema::ActOnOpenMPScanDirective(ArrayRef Clauses, return StmtError(); } DSAStack->setParentHasScanDirective(StartLoc); - return OMPScanDirective::Create(Context, StartLoc, EndLoc, Clauses); + return OMPScanDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses); } -StmtResult Sema::ActOnOpenMPOrderedDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPOrderedDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { const OMPClause *DependFound = nullptr; const OMPClause *DependSourceClause = nullptr; const OMPClause *DependSinkClause = nullptr; @@ -11631,7 +11667,7 @@ StmtResult Sema::ActOnOpenMPOrderedDirective(ArrayRef Clauses, // An ordered construct with the simd clause is the only OpenMP construct // that can appear in the simd region. Diag(StartLoc, diag::err_omp_prohibited_region_simd) - << (LangOpts.OpenMP >= 50 ? 1 : 0); + << (getLangOpts().OpenMP >= 50 ? 1 : 0); ErrorFound = true; } else if ((DependFound || DoacrossFound) && (TC || SC)) { SourceLocation Loc = @@ -11678,10 +11714,11 @@ StmtResult Sema::ActOnOpenMPOrderedDirective(ArrayRef Clauses, if (AStmt) { assert(isa(AStmt) && "Captured statement expected"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); } - return OMPOrderedDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); + return OMPOrderedDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt); } namespace { @@ -12739,10 +12776,11 @@ bool OpenMPAtomicCompareCaptureChecker::checkStmt(Stmt *S, } } // namespace -StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPAtomicDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { + ASTContext &Context = getASTContext(); // Register location of the first atomic directive. 
DSAStack->addAtomicDirectiveLoc(StartLoc); if (!AStmt) @@ -12945,7 +12983,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, << ErrorFound << NoteRange; return StmtError(); } - if (CurContext->isDependentContext()) + if (SemaRef.CurContext->isDependentContext()) V = X = nullptr; } else if (AtomicKind == OMPC_write) { enum { @@ -13007,7 +13045,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, << ErrorFound << NoteRange; return StmtError(); } - if (CurContext->isDependentContext()) + if (SemaRef.CurContext->isDependentContext()) E = X = nullptr; } else if (AtomicKind == OMPC_update || AtomicKind == OMPC_unknown) { // If clause is update: @@ -13018,7 +13056,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, // x binop= expr; // x = x binop expr; // x = expr binop x; - OpenMPAtomicUpdateChecker Checker(*this); + OpenMPAtomicUpdateChecker Checker(SemaRef); if (Checker.checkStatement( Body, (AtomicKind == OMPC_update) @@ -13026,7 +13064,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, : diag::err_omp_atomic_not_expression_statement, diag::note_omp_atomic_update)) return StmtError(); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { E = Checker.getExpr(); X = Checker.getX(); UE = Checker.getUpdateExpr(); @@ -13056,7 +13094,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, if (AtomicBinOp && AtomicBinOp->getOpcode() == BO_Assign) { V = AtomicBinOp->getLHS(); Body = AtomicBinOp->getRHS()->IgnoreParenImpCasts(); - OpenMPAtomicUpdateChecker Checker(*this); + OpenMPAtomicUpdateChecker Checker(SemaRef); if (Checker.checkStatement( Body, diag::err_omp_atomic_capture_not_expression_statement, diag::note_omp_atomic_update)) @@ -13081,7 +13119,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, Diag(NoteLoc, diag::note_omp_atomic_capture) << ErrorFound << NoteRange; return StmtError(); } - if (CurContext->isDependentContext()) + if (SemaRef.CurContext->isDependentContext()) UE = V = E = X = nullptr; } else { // If clause is a capture: @@ -13110,14 +13148,14 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, if (auto *EWC = dyn_cast(Second)) Second = EWC->getSubExpr()->IgnoreParenImpCasts(); // Need to find what subexpression is 'v' and what is 'x'. 
- OpenMPAtomicUpdateChecker Checker(*this); + OpenMPAtomicUpdateChecker Checker(SemaRef); bool IsUpdateExprFound = !Checker.checkStatement(Second); BinaryOperator *BinOp = nullptr; if (IsUpdateExprFound) { BinOp = dyn_cast(First); IsUpdateExprFound = BinOp && BinOp->getOpcode() == BO_Assign; } - if (IsUpdateExprFound && !CurContext->isDependentContext()) { + if (IsUpdateExprFound && !SemaRef.CurContext->isDependentContext()) { // { v = x; x++; } // { v = x; x--; } // { v = x; ++x; } @@ -13147,7 +13185,8 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, BinOp = dyn_cast(Second); IsUpdateExprFound = BinOp && BinOp->getOpcode() == BO_Assign; } - if (IsUpdateExprFound && !CurContext->isDependentContext()) { + if (IsUpdateExprFound && + !SemaRef.CurContext->isDependentContext()) { // { x++; v = x; } // { x--; v = x; } // { ++x; v = x; } @@ -13244,12 +13283,12 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, Diag(NoteLoc, diag::note_omp_atomic_capture) << ErrorFound << NoteRange; return StmtError(); } - if (CurContext->isDependentContext()) + if (SemaRef.CurContext->isDependentContext()) UE = V = E = X = nullptr; } else if (AtomicKind == OMPC_compare) { if (IsCompareCapture) { OpenMPAtomicCompareCaptureChecker::ErrorInfoTy ErrorInfo; - OpenMPAtomicCompareCaptureChecker Checker(*this); + OpenMPAtomicCompareCaptureChecker Checker(SemaRef); if (!Checker.checkStmt(Body, ErrorInfo)) { Diag(ErrorInfo.ErrorLoc, diag::err_omp_atomic_compare_capture) << ErrorInfo.ErrorRange; @@ -13269,7 +13308,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, IsPostfixUpdate = Checker.isPostfixUpdate(); } else { OpenMPAtomicCompareChecker::ErrorInfoTy ErrorInfo; - OpenMPAtomicCompareChecker Checker(*this); + OpenMPAtomicCompareChecker Checker(SemaRef); if (!Checker.checkStmt(Body, ErrorInfo)) { Diag(ErrorInfo.ErrorLoc, diag::err_omp_atomic_compare) << ErrorInfo.ErrorRange; @@ -13307,17 +13346,17 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPAtomicDirective::Create( Context, StartLoc, EndLoc, Clauses, AStmt, {X, V, R, E, UE, D, CE, IsXLHSInRHSPart, IsPostfixUpdate, IsFailOnly}); } -StmtResult Sema::ActOnOpenMPTargetDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPTargetDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -13374,15 +13413,15 @@ StmtResult Sema::ActOnOpenMPTargetDirective(ArrayRef Clauses, } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPTargetDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); + return OMPTargetDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt); } -StmtResult -Sema::ActOnOpenMPTargetParallelDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPTargetParallelDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -13404,14 +13443,14 @@ Sema::ActOnOpenMPTargetParallelDirective(ArrayRef Clauses, CS->getCapturedDecl()->setNothrow(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetParallelDirective::Create( - Context, StartLoc, EndLoc, Clauses, AStmt, + getASTContext(), StartLoc, 
EndLoc, Clauses, AStmt, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPTargetParallelForDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetParallelForDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13440,28 +13479,28 @@ StmtResult Sema::ActOnOpenMPTargetParallelForDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_target_parallel_for, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), CS, *this, *DSAStack, + getOrderedNumberExpr(Clauses), CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target parallel for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetParallelForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } @@ -13498,10 +13537,10 @@ static bool isClauseMappable(ArrayRef Clauses) { return true; } -StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPTargetDataDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -13511,9 +13550,10 @@ StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef Clauses, // At least one map, use_device_addr or use_device_ptr clause must appear on // the directive. 
if (!hasClauses(Clauses, OMPC_map, OMPC_use_device_ptr) && - (LangOpts.OpenMP < 50 || !hasClauses(Clauses, OMPC_use_device_addr))) { + (getLangOpts().OpenMP < 50 || + !hasClauses(Clauses, OMPC_use_device_addr))) { StringRef Expected; - if (LangOpts.OpenMP < 50) + if (getLangOpts().OpenMP < 50) Expected = "'map' or 'use_device_ptr'"; else Expected = "'map', 'use_device_ptr', or 'use_device_addr'"; @@ -13522,16 +13562,15 @@ StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef Clauses, return StmtError(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPTargetDataDirective::Create(Context, StartLoc, EndLoc, Clauses, - AStmt); + return OMPTargetDataDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt); } -StmtResult -Sema::ActOnOpenMPTargetEnterDataDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, Stmt *AStmt) { +StmtResult SemaOpenMP::ActOnOpenMPTargetEnterDataDirective( + ArrayRef Clauses, SourceLocation StartLoc, + SourceLocation EndLoc, Stmt *AStmt) { if (!AStmt) return StmtError(); @@ -13561,14 +13600,13 @@ Sema::ActOnOpenMPTargetEnterDataDirective(ArrayRef Clauses, return StmtError(); } - return OMPTargetEnterDataDirective::Create(Context, StartLoc, EndLoc, Clauses, - AStmt); + return OMPTargetEnterDataDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt); } -StmtResult -Sema::ActOnOpenMPTargetExitDataDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, Stmt *AStmt) { +StmtResult SemaOpenMP::ActOnOpenMPTargetExitDataDirective( + ArrayRef Clauses, SourceLocation StartLoc, + SourceLocation EndLoc, Stmt *AStmt) { if (!AStmt) return StmtError(); @@ -13598,14 +13636,13 @@ Sema::ActOnOpenMPTargetExitDataDirective(ArrayRef Clauses, return StmtError(); } - return OMPTargetExitDataDirective::Create(Context, StartLoc, EndLoc, Clauses, - AStmt); + return OMPTargetExitDataDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt); } -StmtResult Sema::ActOnOpenMPTargetUpdateDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - Stmt *AStmt) { +StmtResult SemaOpenMP::ActOnOpenMPTargetUpdateDirective( + ArrayRef Clauses, SourceLocation StartLoc, + SourceLocation EndLoc, Stmt *AStmt) { if (!AStmt) return StmtError(); @@ -13637,13 +13674,14 @@ StmtResult Sema::ActOnOpenMPTargetUpdateDirective(ArrayRef Clauses, return StmtError(); } - return OMPTargetUpdateDirective::Create(Context, StartLoc, EndLoc, Clauses, - AStmt); + return OMPTargetUpdateDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt); } -StmtResult Sema::ActOnOpenMPTeamsDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPTeamsDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -13659,17 +13697,17 @@ StmtResult Sema::ActOnOpenMPTeamsDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. 
CS->getCapturedDecl()->setNothrow(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); - return OMPTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); + return OMPTeamsDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt); } -StmtResult -Sema::ActOnOpenMPCancellationPointDirective(SourceLocation StartLoc, - SourceLocation EndLoc, - OpenMPDirectiveKind CancelRegion) { +StmtResult SemaOpenMP::ActOnOpenMPCancellationPointDirective( + SourceLocation StartLoc, SourceLocation EndLoc, + OpenMPDirectiveKind CancelRegion) { if (DSAStack->isParentNowaitRegion()) { Diag(StartLoc, diag::err_omp_parent_cancel_region_nowait) << 0; return StmtError(); @@ -13678,14 +13716,13 @@ Sema::ActOnOpenMPCancellationPointDirective(SourceLocation StartLoc, Diag(StartLoc, diag::err_omp_parent_cancel_region_ordered) << 0; return StmtError(); } - return OMPCancellationPointDirective::Create(Context, StartLoc, EndLoc, - CancelRegion); + return OMPCancellationPointDirective::Create(getASTContext(), StartLoc, + EndLoc, CancelRegion); } -StmtResult Sema::ActOnOpenMPCancelDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - OpenMPDirectiveKind CancelRegion) { +StmtResult SemaOpenMP::ActOnOpenMPCancelDirective( + ArrayRef Clauses, SourceLocation StartLoc, + SourceLocation EndLoc, OpenMPDirectiveKind CancelRegion) { if (DSAStack->isParentNowaitRegion()) { Diag(StartLoc, diag::err_omp_parent_cancel_region_nowait) << 1; return StmtError(); @@ -13695,7 +13732,7 @@ StmtResult Sema::ActOnOpenMPCancelDirective(ArrayRef Clauses, return StmtError(); } DSAStack->setParentCancelRegion(/*Cancel=*/true); - return OMPCancelDirective::Create(Context, StartLoc, EndLoc, Clauses, + return OMPCancelDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, CancelRegion); } @@ -13726,7 +13763,7 @@ static bool checkReductionClauseWithNogroup(Sema &S, return false; } -StmtResult Sema::ActOnOpenMPTaskLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPTaskLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13738,33 +13775,33 @@ StmtResult Sema::ActOnOpenMPTaskLoopDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_taskloop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. 
- if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPTaskLoopDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPTaskLoopDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPTaskLoopSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTaskLoopSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13776,21 +13813,21 @@ StmtResult Sema::ActOnOpenMPTaskLoopSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } @@ -13798,23 +13835,23 @@ StmtResult Sema::ActOnOpenMPTaskLoopSimdDirective( // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPTaskLoopSimdDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPTaskLoopSimdDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPMasterTaskLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPMasterTaskLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13826,33 +13863,33 @@ StmtResult Sema::ActOnOpenMPMasterTaskLoopDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_master_taskloop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPMasterTaskLoopDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPMasterTaskLoopDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPMaskedTaskLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPMaskedTaskLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13864,33 +13901,33 @@ StmtResult Sema::ActOnOpenMPMaskedTaskLoopDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_masked_taskloop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. 
- if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPMaskedTaskLoopDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPMaskedTaskLoopDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPMasterTaskLoopSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPMasterTaskLoopSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13902,21 +13939,21 @@ StmtResult Sema::ActOnOpenMPMasterTaskLoopSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_master_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } @@ -13924,23 +13961,23 @@ StmtResult Sema::ActOnOpenMPMasterTaskLoopSimdDirective( // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPMasterTaskLoopSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPMaskedTaskLoopSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPMaskedTaskLoopSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13952,21 +13989,21 @@ StmtResult Sema::ActOnOpenMPMaskedTaskLoopSimdDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_masked_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } @@ -13974,23 +14011,23 @@ StmtResult Sema::ActOnOpenMPMaskedTaskLoopSimdDirective( // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPMaskedTaskLoopSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelMasterTaskLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14021,33 +14058,33 @@ StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_parallel_master_taskloop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. 
- if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMasterTaskLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPParallelMaskedTaskLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedTaskLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14078,33 +14115,33 @@ StmtResult Sema::ActOnOpenMPParallelMaskedTaskLoopDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_parallel_masked_taskloop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMaskedTaskLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelMasterTaskLoopSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14135,21 +14172,21 @@ StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_parallel_master_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. 
for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } @@ -14157,23 +14194,23 @@ StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopSimdDirective( // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMasterTaskLoopSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPParallelMaskedTaskLoopSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedTaskLoopSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14204,21 +14241,21 @@ StmtResult Sema::ActOnOpenMPParallelMaskedTaskLoopSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_parallel_masked_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } @@ -14226,23 +14263,23 @@ StmtResult Sema::ActOnOpenMPParallelMaskedTaskLoopSimdDirective( // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. 
- if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMaskedTaskLoopSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPDistributeDirective( +StmtResult SemaOpenMP::ActOnOpenMPDistributeDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14258,21 +14295,21 @@ StmtResult Sema::ActOnOpenMPDistributeDirective( unsigned NestedLoopCount = checkOpenMPLoop(OMPD_distribute, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, AStmt, - *this, *DSAStack, VarsWithImplicitDSA, B); + SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); auto *DistributeDirective = OMPDistributeDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getMappedDirective()); return DistributeDirective; } -StmtResult Sema::ActOnOpenMPDistributeParallelForDirective( +StmtResult SemaOpenMP::ActOnOpenMPDistributeParallelForDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14302,21 +14339,21 @@ StmtResult Sema::ActOnOpenMPDistributeParallelForDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_distribute_parallel_for, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPDistributeParallelForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPDistributeParallelForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPDistributeParallelForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14346,34 +14383,34 @@ StmtResult Sema::ActOnOpenMPDistributeParallelForSimdDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop( OMPD_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPDistributeParallelForSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPDistributeSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPDistributeSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14402,34 +14439,34 @@ StmtResult Sema::ActOnOpenMPDistributeSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_distribute_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, - *DSAStack, VarsWithImplicitDSA, B); + nullptr /*ordered not a clause on distribute*/, CS, + SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPDistributeSimdDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPDistributeSimdDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetParallelForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetParallelForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14459,33 +14496,33 @@ StmtResult Sema::ActOnOpenMPTargetParallelForSimdDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop( OMPD_target_parallel_for_simd, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), CS, *this, *DSAStack, VarsWithImplicitDSA, - B); + getOrderedNumberExpr(Clauses), CS, SemaRef, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target parallel for simd loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetParallelForSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14514,34 +14551,34 @@ StmtResult Sema::ActOnOpenMPTargetSimdDirective( // nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_target_simd, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), CS, *this, *DSAStack, + getOrderedNumberExpr(Clauses), CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target simd loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPTargetSimdDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPTargetSimdDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTeamsDistributeDirective( +StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14570,23 +14607,23 @@ StmtResult Sema::ActOnOpenMPTeamsDistributeDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_teams_distribute, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, - *DSAStack, VarsWithImplicitDSA, B); + nullptr /*ordered not a clause on distribute*/, CS, + SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp teams distribute loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTeamsDistributeSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14616,38 +14653,38 @@ StmtResult Sema::ActOnOpenMPTeamsDistributeSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_teams_distribute_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp teams distribute simd loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeParallelForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14678,38 +14715,38 @@ StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForSimdDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop( OMPD_teams_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeParallelForSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForDirective( +StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeParallelForDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14740,28 +14777,27 @@ StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeParallelForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -14783,7 +14819,7 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. 
CS->getCapturedDecl()->setNothrow(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); const OMPClause *BareClause = nullptr; bool HasThreadLimitAndNumTeamsClause = hasClauses(Clauses, OMPC_num_teams) && @@ -14798,11 +14834,11 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, return StmtError(); } - return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses, - AStmt); + return OMPTargetTeamsDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt); } -StmtResult Sema::ActOnOpenMPTargetTeamsDistributeDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14832,20 +14868,20 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_target_teams_distribute, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target teams distribute loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsDistributeDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeParallelForDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14875,32 +14911,32 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_target_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target teams distribute parallel for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. 
for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsDistributeParallelForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14931,35 +14967,35 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective( unsigned NestedLoopCount = checkOpenMPLoop(OMPD_target_teams_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, - *DSAStack, VarsWithImplicitDSA, B); + nullptr /*ordered not a clause on distribute*/, CS, + SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target teams distribute parallel for simd loop exprs were not " "built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsDistributeParallelForSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetTeamsDistributeSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14989,34 +15025,34 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_target_teams_distribute_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target teams distribute simd loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. 
     for (OMPClause *C : Clauses) {
       if (auto *LC = dyn_cast<OMPLinearClause>(C))
         if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef),
-                                     B.NumIterations, *this, CurScope,
-                                     DSAStack))
+                                     B.NumIterations, SemaRef,
+                                     SemaRef.getCurScope(), DSAStack))
           return StmtError();
     }
   }

-  if (checkSimdlenSafelenSpecified(*this, Clauses))
+  if (checkSimdlenSafelenSpecified(SemaRef, Clauses))
     return StmtError();

-  setFunctionHasBranchProtectedScope();
+  SemaRef.setFunctionHasBranchProtectedScope();
   return OMPTargetTeamsDistributeSimdDirective::Create(
-      Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
+      getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
 }

-bool Sema::checkTransformableLoopNest(
+bool SemaOpenMP::checkTransformableLoopNest(
     OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
     SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
     Stmt *&Body,
@@ -15029,7 +15065,7 @@ bool Sema::checkTransformableLoopNest(
                                                   Stmt *CurStmt) {
         VarsWithInheritedDSAType TmpDSA;
         unsigned SingleNumLoops =
-            checkOpenMPLoop(Kind, nullptr, nullptr, CurStmt, *this, *DSAStack,
+            checkOpenMPLoop(Kind, nullptr, nullptr, CurStmt, SemaRef, *DSAStack,
                             TmpDSA, LoopHelpers[Cnt]);
         if (SingleNumLoops == 0)
           return true;
@@ -15065,9 +15101,11 @@ bool Sema::checkTransformableLoopNest(
   return Result;
 }

-StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
-                                          Stmt *AStmt, SourceLocation StartLoc,
-                                          SourceLocation EndLoc) {
+StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
+                                                Stmt *AStmt,
+                                                SourceLocation StartLoc,
+                                                SourceLocation EndLoc) {
+  ASTContext &Context = getASTContext();
   auto SizesClauses =
       OMPExecutableDirective::getClausesOfKind<OMPSizesClause>(Clauses);
   if (SizesClauses.empty()) {
@@ -15091,7 +15129,7 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
     return StmtError();

   // Delay tiling to when template is completely instantiated.
-  if (CurContext->isDependentContext())
+  if (SemaRef.CurContext->isDependentContext())
     return OMPTileDirective::Create(Context, StartLoc, EndLoc, Clauses,
                                     NumLoops, AStmt, nullptr, nullptr);

@@ -15117,7 +15155,7 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
     std::string FloorCntName =
         (Twine(".floor_") + llvm::utostr(I) + ".iv." + OrigVarName).str();
     VarDecl *FloorCntDecl =
-        buildVarDecl(*this, {}, CntTy, FloorCntName, nullptr, OrigCntVar);
+        buildVarDecl(SemaRef, {}, CntTy, FloorCntName, nullptr, OrigCntVar);
     FloorIndVars[I] = FloorCntDecl;
   }

@@ -15130,7 +15168,8 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
     // used by the expressions to derive the original iteration variable's
     // value from the logical iteration number.
     auto *TileCntDecl = cast<VarDecl>(IterVarRef->getDecl());
-    TileCntDecl->setDeclName(&PP.getIdentifierTable().get(TileCntName));
+    TileCntDecl->setDeclName(
+        &SemaRef.PP.getIdentifierTable().get(TileCntName));
     TileIndVars[I] = TileCntDecl;
   }
   for (auto &P : OriginalInits[I]) {
@@ -15159,17 +15198,18 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
     auto *OrigCntVar = cast<DeclRefExpr>(LoopHelper.Counters[0]);
     QualType CntTy = OrigCntVar->getType();
     Expr *DimTileSize = SizesClause->getSizesRefs()[I];
-    Scope *CurScope = getCurScope();
+    Scope *CurScope = SemaRef.getCurScope();
     // Commonly used variables.
-    DeclRefExpr *TileIV = buildDeclRefExpr(*this, TileIndVars[I], CntTy,
+    DeclRefExpr *TileIV = buildDeclRefExpr(SemaRef, TileIndVars[I], CntTy,
                                            OrigCntVar->getExprLoc());
-    DeclRefExpr *FloorIV = buildDeclRefExpr(*this, FloorIndVars[I], CntTy,
+    DeclRefExpr *FloorIV = buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy,
                                             OrigCntVar->getExprLoc());

     // For init-statement: auto .tile.iv = .floor.iv
-    AddInitializerToDecl(TileIndVars[I], DefaultLvalueConversion(FloorIV).get(),
-                         /*DirectInit=*/false);
+    SemaRef.AddInitializerToDecl(TileIndVars[I],
+                                 SemaRef.DefaultLvalueConversion(FloorIV).get(),
+                                 /*DirectInit=*/false);
     Decl *CounterDecl = TileIndVars[I];
     StmtResult InitStmt = new (Context)
         DeclStmt(DeclGroupRef::Create(Context, &CounterDecl, 1),
@@ -15179,28 +15219,29 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,

     // For cond-expression: .tile.iv < min(.floor.iv + DimTileSize,
     // NumIterations)
-    ExprResult EndOfTile = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(),
-                                      BO_Add, FloorIV, DimTileSize);
+    ExprResult EndOfTile = SemaRef.BuildBinOp(
+        CurScope, LoopHelper.Cond->getExprLoc(), BO_Add, FloorIV, DimTileSize);
     if (!EndOfTile.isUsable())
       return StmtError();
     ExprResult IsPartialTile =
-        BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT,
-                   NumIterations, EndOfTile.get());
+        SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT,
+                           NumIterations, EndOfTile.get());
     if (!IsPartialTile.isUsable())
       return StmtError();
-    ExprResult MinTileAndIterSpace = ActOnConditionalOp(
+    ExprResult MinTileAndIterSpace = SemaRef.ActOnConditionalOp(
         LoopHelper.Cond->getBeginLoc(), LoopHelper.Cond->getEndLoc(),
         IsPartialTile.get(), NumIterations, EndOfTile.get());
     if (!MinTileAndIterSpace.isUsable())
       return StmtError();
-    ExprResult CondExpr = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(),
-                                     BO_LT, TileIV, MinTileAndIterSpace.get());
+    ExprResult CondExpr =
+        SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT,
+                           TileIV, MinTileAndIterSpace.get());
     if (!CondExpr.isUsable())
       return StmtError();

     // For incr-statement: ++.tile.iv
-    ExprResult IncrStmt =
-        BuildUnaryOp(CurScope, LoopHelper.Inc->getExprLoc(), UO_PreInc, TileIV);
+    ExprResult IncrStmt = SemaRef.BuildUnaryOp(
+        CurScope, LoopHelper.Inc->getExprLoc(), UO_PreInc, TileIV);
     if (!IncrStmt.isUsable())
       return StmtError();

@@ -15235,16 +15276,16 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
     DeclRefExpr *OrigCntVar = cast<DeclRefExpr>(LoopHelper.Counters[0]);
     QualType CntTy = OrigCntVar->getType();
     Expr *DimTileSize = SizesClause->getSizesRefs()[I];
-    Scope *CurScope = getCurScope();
+    Scope *CurScope = SemaRef.getCurScope();
     // Commonly used variables.
-    DeclRefExpr *FloorIV = buildDeclRefExpr(*this, FloorIndVars[I], CntTy,
+    DeclRefExpr *FloorIV = buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy,
                                             OrigCntVar->getExprLoc());

     // For init-statement: auto .floor.iv = 0
-    AddInitializerToDecl(
+    SemaRef.AddInitializerToDecl(
         FloorIndVars[I],
-        ActOnIntegerConstant(LoopHelper.Init->getExprLoc(), 0).get(),
+        SemaRef.ActOnIntegerConstant(LoopHelper.Init->getExprLoc(), 0).get(),
         /*DirectInit=*/false);
     Decl *CounterDecl = FloorIndVars[I];
     StmtResult InitStmt = new (Context)
@@ -15254,14 +15295,15 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
       return StmtError();

     // For cond-expression: .floor.iv < NumIterations
-    ExprResult CondExpr = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(),
-                                     BO_LT, FloorIV, NumIterations);
+    ExprResult CondExpr = SemaRef.BuildBinOp(
+        CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, FloorIV, NumIterations);
     if (!CondExpr.isUsable())
       return StmtError();

     // For incr-statement: .floor.iv += DimTileSize
-    ExprResult IncrStmt = BuildBinOp(CurScope, LoopHelper.Inc->getExprLoc(),
-                                     BO_AddAssign, FloorIV, DimTileSize);
+    ExprResult IncrStmt =
+        SemaRef.BuildBinOp(CurScope, LoopHelper.Inc->getExprLoc(), BO_AddAssign,
+                           FloorIV, DimTileSize);
     if (!IncrStmt.isUsable())
       return StmtError();

@@ -15276,15 +15318,18 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
                                   buildPreInits(Context, PreInits));
 }

-StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
-                                            Stmt *AStmt,
-                                            SourceLocation StartLoc,
-                                            SourceLocation EndLoc) {
+StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
+                                                  Stmt *AStmt,
+                                                  SourceLocation StartLoc,
+                                                  SourceLocation EndLoc) {
+  ASTContext &Context = getASTContext();
+  Scope *CurScope = SemaRef.getCurScope();
   // Empty statement should only be possible if there already was an error.
   if (!AStmt)
     return StmtError();

-  if (checkMutuallyExclusiveClauses(*this, Clauses, {OMPC_partial, OMPC_full}))
+  if (checkMutuallyExclusiveClauses(SemaRef, Clauses,
+                                    {OMPC_partial, OMPC_full}))
     return StmtError();

   const OMPFullClause *FullClause =
@@ -15307,7 +15352,7 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
   unsigned NumGeneratedLoops = PartialClause ? 1 : 0;

   // Delay unrolling to when template is completely instantiated.
-  if (CurContext->isDependentContext())
+  if (SemaRef.CurContext->isDependentContext())
     return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
                                       NumGeneratedLoops, nullptr, nullptr);

@@ -15412,8 +15457,8 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
   assert(Factor > 0 && "Expected positive unroll factor");
   auto MakeFactorExpr = [this, Factor, IVTy, FactorLoc]() {
     return IntegerLiteral::Create(
-        Context, llvm::APInt(Context.getIntWidth(IVTy), Factor), IVTy,
-        FactorLoc);
+        getASTContext(), llvm::APInt(getASTContext().getIntWidth(IVTy), Factor),
+        IVTy, FactorLoc);
   };

   // Iteration variable SourceLocations.
@@ -15430,30 +15475,31 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,

   // Create the iteration variable for the unrolled loop.
   VarDecl *OuterIVDecl =
-      buildVarDecl(*this, {}, IVTy, OuterIVName, nullptr, OrigVar);
+      buildVarDecl(SemaRef, {}, IVTy, OuterIVName, nullptr, OrigVar);
   auto MakeOuterRef = [this, OuterIVDecl, IVTy, OrigVarLoc]() {
-    return buildDeclRefExpr(*this, OuterIVDecl, IVTy, OrigVarLoc);
+    return buildDeclRefExpr(SemaRef, OuterIVDecl, IVTy, OrigVarLoc);
   };

   // Iteration variable for the inner loop: Reuse the iteration variable created
   // by checkOpenMPLoop.
   auto *InnerIVDecl = cast<VarDecl>(IterationVarRef->getDecl());
-  InnerIVDecl->setDeclName(&PP.getIdentifierTable().get(InnerIVName));
+  InnerIVDecl->setDeclName(&SemaRef.PP.getIdentifierTable().get(InnerIVName));
   auto MakeInnerRef = [this, InnerIVDecl, IVTy, OrigVarLoc]() {
-    return buildDeclRefExpr(*this, InnerIVDecl, IVTy, OrigVarLoc);
+    return buildDeclRefExpr(SemaRef, InnerIVDecl, IVTy, OrigVarLoc);
   };

   // Make a copy of the NumIterations expression for each use: By the AST
   // constraints, every expression object in a DeclContext must be unique.
-  CaptureVars CopyTransformer(*this);
+  CaptureVars CopyTransformer(SemaRef);
   auto MakeNumIterations = [&CopyTransformer, &LoopHelper]() -> Expr * {
     return AssertSuccess(
         CopyTransformer.TransformExpr(LoopHelper.NumIterations));
   };

   // Inner For init-statement: auto .unroll_inner.iv = .unrolled.iv
-  ExprResult LValueConv = DefaultLvalueConversion(MakeOuterRef());
-  AddInitializerToDecl(InnerIVDecl, LValueConv.get(), /*DirectInit=*/false);
+  ExprResult LValueConv = SemaRef.DefaultLvalueConversion(MakeOuterRef());
+  SemaRef.AddInitializerToDecl(InnerIVDecl, LValueConv.get(),
+                               /*DirectInit=*/false);
   StmtResult InnerInit = new (Context)
       DeclStmt(DeclGroupRef(InnerIVDecl), OrigVarLocBegin, OrigVarLocEnd);
   if (!InnerInit.isUsable())
@@ -15466,28 +15512,30 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
   // \endcode
   // This conjunction of two conditions allows ScalarEvolution to derive the
   // maximum trip count of the inner loop.
-  ExprResult EndOfTile = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(),
-                                    BO_Add, MakeOuterRef(), MakeFactorExpr());
+  ExprResult EndOfTile =
+      SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_Add,
+                         MakeOuterRef(), MakeFactorExpr());
   if (!EndOfTile.isUsable())
     return StmtError();
-  ExprResult InnerCond1 = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(),
-                                     BO_LT, MakeInnerRef(), EndOfTile.get());
+  ExprResult InnerCond1 =
+      SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT,
+                         MakeInnerRef(), EndOfTile.get());
   if (!InnerCond1.isUsable())
     return StmtError();
   ExprResult InnerCond2 =
-      BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, MakeInnerRef(),
-                 MakeNumIterations());
+      SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT,
+                         MakeInnerRef(), MakeNumIterations());
   if (!InnerCond2.isUsable())
     return StmtError();
   ExprResult InnerCond =
-      BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LAnd,
-                 InnerCond1.get(), InnerCond2.get());
+      SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LAnd,
+                         InnerCond1.get(), InnerCond2.get());
   if (!InnerCond.isUsable())
     return StmtError();

   // Inner For incr-statement: ++.unroll_inner.iv
-  ExprResult InnerIncr = BuildUnaryOp(CurScope, LoopHelper.Inc->getExprLoc(),
-                                      UO_PreInc, MakeInnerRef());
+  ExprResult InnerIncr = SemaRef.BuildUnaryOp(
+      CurScope, LoopHelper.Inc->getExprLoc(), UO_PreInc, MakeInnerRef());
   if (!InnerIncr.isUsable())
     return StmtError();

@@ -15496,7 +15544,7 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
   InnerBodyStmts.append(LoopHelper.Updates.begin(), LoopHelper.Updates.end());
   InnerBodyStmts.push_back(Body);
   CompoundStmt *InnerBody =
-      CompoundStmt::Create(Context, InnerBodyStmts, FPOptionsOverride(),
+      CompoundStmt::Create(getASTContext(), InnerBodyStmts, FPOptionsOverride(),
                            Body->getBeginLoc(), Body->getEndLoc());
   ForStmt *InnerFor = new (Context)
       ForStmt(Context, InnerInit.get(), InnerCond.get(), nullptr,
@@ -15518,12 +15566,13 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
   LoopHintAttr *UnrollHintAttr =
       LoopHintAttr::CreateImplicit(Context, LoopHintAttr::UnrollCount,
                                    LoopHintAttr::Numeric, MakeFactorExpr());
-  AttributedStmt *InnerUnrolled =
-      AttributedStmt::Create(Context, StartLoc, {UnrollHintAttr}, InnerFor);
+  AttributedStmt *InnerUnrolled = AttributedStmt::Create(
+      getASTContext(), StartLoc, {UnrollHintAttr}, InnerFor);

   // Outer For init-statement: auto .unrolled.iv = 0
-  AddInitializerToDecl(
-      OuterIVDecl, ActOnIntegerConstant(LoopHelper.Init->getExprLoc(), 0).get(),
+  SemaRef.AddInitializerToDecl(
+      OuterIVDecl,
+      SemaRef.ActOnIntegerConstant(LoopHelper.Init->getExprLoc(), 0).get(),
       /*DirectInit=*/false);
   StmtResult OuterInit = new (Context)
       DeclStmt(DeclGroupRef(OuterIVDecl), OrigVarLocBegin, OrigVarLocEnd);
@@ -15532,15 +15581,15 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,

   // Outer For cond-expression: .unrolled.iv < NumIterations
   ExprResult OuterConde =
-      BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, MakeOuterRef(),
-                 MakeNumIterations());
+      SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT,
+                         MakeOuterRef(), MakeNumIterations());
   if (!OuterConde.isUsable())
     return StmtError();

   // Outer For incr-statement: .unrolled.iv += Factor
   ExprResult OuterIncr =
-      BuildBinOp(CurScope, LoopHelper.Inc->getExprLoc(), BO_AddAssign,
-                 MakeOuterRef(), MakeFactorExpr());
+      SemaRef.BuildBinOp(CurScope, LoopHelper.Inc->getExprLoc(), BO_AddAssign,
+                         MakeOuterRef(), MakeFactorExpr());
   if (!OuterIncr.isUsable())
     return StmtError();

@@ -15555,10 +15604,11 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
                                buildPreInits(Context, PreInits));
 }

-OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr,
-                                             SourceLocation StartLoc,
-                                             SourceLocation LParenLoc,
-                                             SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind,
+                                                   Expr *Expr,
+                                                   SourceLocation StartLoc,
+                                                   SourceLocation LParenLoc,
+                                                   SourceLocation EndLoc) {
   OMPClause *Res = nullptr;
   switch (Kind) {
   case OMPC_final:
@@ -16646,19 +16696,17 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause(
   return CaptureRegion;
 }

-OMPClause *Sema::ActOnOpenMPIfClause(OpenMPDirectiveKind NameModifier,
-                                     Expr *Condition, SourceLocation StartLoc,
-                                     SourceLocation LParenLoc,
-                                     SourceLocation NameModifierLoc,
-                                     SourceLocation ColonLoc,
-                                     SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPIfClause(
+    OpenMPDirectiveKind NameModifier, Expr *Condition, SourceLocation StartLoc,
+    SourceLocation LParenLoc, SourceLocation NameModifierLoc,
+    SourceLocation ColonLoc, SourceLocation EndLoc) {
   Expr *ValExpr = Condition;
   Stmt *HelperValStmt = nullptr;
   OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
   if (!Condition->isValueDependent() && !Condition->isTypeDependent() &&
       !Condition->isInstantiationDependent() &&
       !Condition->containsUnexpandedParameterPack()) {
-    ExprResult Val = CheckBooleanCondition(StartLoc, Condition);
+    ExprResult Val = SemaRef.CheckBooleanCondition(StartLoc, Condition);
     if (Val.isInvalid())
       return nullptr;

@@ -16666,57 +16714,60 @@ OMPClause *Sema::ActOnOpenMPIfClause(OpenMPDirectiveKind NameModifier,
     OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
     CaptureRegion = getOpenMPCaptureRegionForClause(
-        DKind, OMPC_if, LangOpts.OpenMP, NameModifier);
-    if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
-      ValExpr = MakeFullExpr(ValExpr).get();
+        DKind, OMPC_if, getLangOpts().OpenMP, NameModifier);
+    if (CaptureRegion != OMPD_unknown &&
+        !SemaRef.CurContext->isDependentContext()) {
+      ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
       llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
-      ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
-      HelperValStmt = buildPreInits(Context, Captures);
+      ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
+      HelperValStmt = buildPreInits(getASTContext(), Captures);
     }
   }

-  return new (Context)
+  return new (getASTContext())
       OMPIfClause(NameModifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc,
                   LParenLoc, NameModifierLoc, ColonLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPFinalClause(Expr *Condition,
-                                        SourceLocation StartLoc,
-                                        SourceLocation LParenLoc,
-                                        SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPFinalClause(Expr *Condition,
+                                              SourceLocation StartLoc,
+                                              SourceLocation LParenLoc,
+                                              SourceLocation EndLoc) {
   Expr *ValExpr = Condition;
   Stmt *HelperValStmt = nullptr;
   OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
   if (!Condition->isValueDependent() && !Condition->isTypeDependent() &&
       !Condition->isInstantiationDependent() &&
       !Condition->containsUnexpandedParameterPack()) {
-    ExprResult Val = CheckBooleanCondition(StartLoc, Condition);
+    ExprResult Val = SemaRef.CheckBooleanCondition(StartLoc, Condition);
     if (Val.isInvalid())
       return nullptr;

-    ValExpr = MakeFullExpr(Val.get()).get();
+    ValExpr = SemaRef.MakeFullExpr(Val.get()).get();

     OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
-    CaptureRegion =
-        getOpenMPCaptureRegionForClause(DKind, OMPC_final, LangOpts.OpenMP);
-    if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
-      ValExpr = MakeFullExpr(ValExpr).get();
+    CaptureRegion = getOpenMPCaptureRegionForClause(DKind, OMPC_final,
+                                                    getLangOpts().OpenMP);
+    if (CaptureRegion != OMPD_unknown &&
+        !SemaRef.CurContext->isDependentContext()) {
+      ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
       llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
-      ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
-      HelperValStmt = buildPreInits(Context, Captures);
+      ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
+      HelperValStmt = buildPreInits(getASTContext(), Captures);
     }
   }

-  return new (Context) OMPFinalClause(ValExpr, HelperValStmt, CaptureRegion,
-                                      StartLoc, LParenLoc, EndLoc);
+  return new (getASTContext()) OMPFinalClause(
+      ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
 }

-ExprResult Sema::PerformOpenMPImplicitIntegerConversion(SourceLocation Loc,
-                                                        Expr *Op) {
+ExprResult
+SemaOpenMP::PerformOpenMPImplicitIntegerConversion(SourceLocation Loc,
+                                                   Expr *Op) {
   if (!Op)
     return ExprError();

-  class IntConvertDiagnoser : public ICEConvertDiagnoser {
+  class IntConvertDiagnoser : public Sema::ICEConvertDiagnoser {
   public:
     IntConvertDiagnoser()
         : ICEConvertDiagnoser(/*AllowScopedEnumerations*/ false, false, true) {}
@@ -16752,7 +16803,7 @@ ExprResult Sema::PerformOpenMPImplicitIntegerConversion(SourceLocation Loc,
       llvm_unreachable("conversion functions are permitted");
     }
   } ConvertDiagnoser;
-  return PerformContextualImplicitConversion(Loc, Op, ConvertDiagnoser);
+  return SemaRef.PerformContextualImplicitConversion(Loc, Op, ConvertDiagnoser);
 }

 static bool
@@ -16765,7 +16816,7 @@ isNonNegativeIntegerValue(Expr *&ValExpr, Sema &SemaRef, OpenMPClauseKind CKind,
       !ValExpr->isInstantiationDependent()) {
     SourceLocation Loc = ValExpr->getExprLoc();
     ExprResult Value =
-        SemaRef.PerformOpenMPImplicitIntegerConversion(Loc, ValExpr);
+        SemaRef.OpenMP().PerformOpenMPImplicitIntegerConversion(Loc, ValExpr);
     if (Value.isInvalid())
       return false;
@@ -16797,37 +16848,37 @@ isNonNegativeIntegerValue(Expr *&ValExpr, Sema &SemaRef, OpenMPClauseKind CKind,
   return true;
 }

-OMPClause *Sema::ActOnOpenMPNumThreadsClause(Expr *NumThreads,
-                                             SourceLocation StartLoc,
-                                             SourceLocation LParenLoc,
-                                             SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPNumThreadsClause(Expr *NumThreads,
+                                                   SourceLocation StartLoc,
+                                                   SourceLocation LParenLoc,
+                                                   SourceLocation EndLoc) {
   Expr *ValExpr = NumThreads;
   Stmt *HelperValStmt = nullptr;

   // OpenMP [2.5, Restrictions]
   // The num_threads expression must evaluate to a positive integer value.
-  if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_num_threads,
+  if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_num_threads,
                                  /*StrictlyPositive=*/true))
     return nullptr;

   OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
-  OpenMPDirectiveKind CaptureRegion =
-      getOpenMPCaptureRegionForClause(DKind, OMPC_num_threads, LangOpts.OpenMP);
-  if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
-    ValExpr = MakeFullExpr(ValExpr).get();
+  OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause(
+      DKind, OMPC_num_threads, getLangOpts().OpenMP);
+  if (CaptureRegion != OMPD_unknown &&
+      !SemaRef.CurContext->isDependentContext()) {
+    ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
     llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
-    ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
-    HelperValStmt = buildPreInits(Context, Captures);
+    ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
+    HelperValStmt = buildPreInits(getASTContext(), Captures);
   }

-  return new (Context) OMPNumThreadsClause(
+  return new (getASTContext()) OMPNumThreadsClause(
       ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
 }

-ExprResult Sema::VerifyPositiveIntegerConstantInClause(Expr *E,
-                                                       OpenMPClauseKind CKind,
-                                                       bool StrictlyPositive,
-                                                       bool SuppressExprDiags) {
+ExprResult SemaOpenMP::VerifyPositiveIntegerConstantInClause(
+    Expr *E, OpenMPClauseKind CKind, bool StrictlyPositive,
+    bool SuppressExprDiags) {
   if (!E)
     return ExprError();
   if (E->isValueDependent() || E->isTypeDependent() ||
@@ -16841,14 +16892,16 @@ ExprResult Sema::VerifyPositiveIntegerConstantInClause(Expr *E,
       // expression.
       struct SuppressedDiagnoser : public Sema::VerifyICEDiagnoser {
         SuppressedDiagnoser() : VerifyICEDiagnoser(/*Suppress=*/true) {}
-        Sema::SemaDiagnosticBuilder diagnoseNotICE(Sema &S,
-                                                   SourceLocation Loc) override {
+        SemaBase::SemaDiagnosticBuilder
+        diagnoseNotICE(Sema &S, SourceLocation Loc) override {
           llvm_unreachable("Diagnostic suppressed");
         }
       } Diagnoser;
-    ICE = VerifyIntegerConstantExpression(E, &Result, Diagnoser, AllowFold);
+    ICE = SemaRef.VerifyIntegerConstantExpression(E, &Result, Diagnoser,
+                                                  Sema::AllowFold);
   } else {
-    ICE = VerifyIntegerConstantExpression(E, &Result, /*FIXME*/ AllowFold);
+    ICE = SemaRef.VerifyIntegerConstantExpression(E, &Result,
+                                                  /*FIXME*/ Sema::AllowFold);
   }
   if (ICE.isInvalid())
     return ExprError();
@@ -16872,29 +16925,31 @@ ExprResult Sema::VerifyPositiveIntegerConstantInClause(Expr *E,
   return ICE;
 }

-OMPClause *Sema::ActOnOpenMPSafelenClause(Expr *Len, SourceLocation StartLoc,
-                                          SourceLocation LParenLoc,
-                                          SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPSafelenClause(Expr *Len,
+                                                SourceLocation StartLoc,
+                                                SourceLocation LParenLoc,
+                                                SourceLocation EndLoc) {
   // OpenMP [2.8.1, simd construct, Description]
   // The parameter of the safelen clause must be a constant
   // positive integer expression.
   ExprResult Safelen = VerifyPositiveIntegerConstantInClause(Len, OMPC_safelen);
   if (Safelen.isInvalid())
     return nullptr;
-  return new (Context)
+  return new (getASTContext())
       OMPSafelenClause(Safelen.get(), StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPSimdlenClause(Expr *Len, SourceLocation StartLoc,
-                                          SourceLocation LParenLoc,
-                                          SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPSimdlenClause(Expr *Len,
+                                                SourceLocation StartLoc,
+                                                SourceLocation LParenLoc,
+                                                SourceLocation EndLoc) {
   // OpenMP [2.8.1, simd construct, Description]
   // The parameter of the simdlen clause must be a constant
   // positive integer expression.
   ExprResult Simdlen = VerifyPositiveIntegerConstantInClause(Len, OMPC_simdlen);
   if (Simdlen.isInvalid())
     return nullptr;
-  return new (Context)
+  return new (getASTContext())
       OMPSimdlenClause(Simdlen.get(), StartLoc, LParenLoc, EndLoc);
 }

@@ -16954,31 +17009,32 @@ static bool findOMPAllocatorHandleT(Sema &S, SourceLocation Loc,
   return true;
 }

-OMPClause *Sema::ActOnOpenMPAllocatorClause(Expr *A, SourceLocation StartLoc,
-                                            SourceLocation LParenLoc,
-                                            SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPAllocatorClause(Expr *A,
+                                                  SourceLocation StartLoc,
+                                                  SourceLocation LParenLoc,
+                                                  SourceLocation EndLoc) {
   // OpenMP [2.11.3, allocate Directive, Description]
   // allocator is an expression of omp_allocator_handle_t type.
-  if (!findOMPAllocatorHandleT(*this, A->getExprLoc(), DSAStack))
+  if (!findOMPAllocatorHandleT(SemaRef, A->getExprLoc(), DSAStack))
     return nullptr;

-  ExprResult Allocator = DefaultLvalueConversion(A);
+  ExprResult Allocator = SemaRef.DefaultLvalueConversion(A);
   if (Allocator.isInvalid())
     return nullptr;
-  Allocator = PerformImplicitConversion(Allocator.get(),
-                                        DSAStack->getOMPAllocatorHandleT(),
-                                        Sema::AA_Initializing,
-                                        /*AllowExplicit=*/true);
+  Allocator = SemaRef.PerformImplicitConversion(
+      Allocator.get(), DSAStack->getOMPAllocatorHandleT(),
+      Sema::AA_Initializing,
+      /*AllowExplicit=*/true);
   if (Allocator.isInvalid())
     return nullptr;
-  return new (Context)
+  return new (getASTContext())
       OMPAllocatorClause(Allocator.get(), StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPCollapseClause(Expr *NumForLoops,
-                                           SourceLocation StartLoc,
-                                           SourceLocation LParenLoc,
-                                           SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPCollapseClause(Expr *NumForLoops,
+                                                 SourceLocation StartLoc,
+                                                 SourceLocation LParenLoc,
+                                                 SourceLocation EndLoc) {
   // OpenMP [2.7.1, loop construct, Description]
   // OpenMP [2.8.1, simd construct, Description]
   // OpenMP [2.9.6, distribute construct, Description]
@@ -16988,14 +17044,14 @@ OMPClause *Sema::ActOnOpenMPCollapseClause(Expr *NumForLoops,
       VerifyPositiveIntegerConstantInClause(NumForLoops, OMPC_collapse);
   if (NumForLoopsResult.isInvalid())
     return nullptr;
-  return new (Context)
+  return new (getASTContext())
       OMPCollapseClause(NumForLoopsResult.get(), StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPOrderedClause(SourceLocation StartLoc,
-                                          SourceLocation EndLoc,
-                                          SourceLocation LParenLoc,
-                                          Expr *NumForLoops) {
+OMPClause *SemaOpenMP::ActOnOpenMPOrderedClause(SourceLocation StartLoc,
+                                                SourceLocation EndLoc,
+                                                SourceLocation LParenLoc,
+                                                Expr *NumForLoops) {
   // OpenMP [2.7.1, loop construct, Description]
   // OpenMP [2.8.1, simd construct, Description]
   // OpenMP [2.9.6, distribute construct, Description]
@@ -17010,14 +17066,15 @@ OMPClause *Sema::ActOnOpenMPOrderedClause(SourceLocation StartLoc,
   } else {
     NumForLoops = nullptr;
   }
-  auto *Clause = OMPOrderedClause::Create(
-      Context, NumForLoops, NumForLoops ? DSAStack->getAssociatedLoops() : 0,
-      StartLoc, LParenLoc, EndLoc);
+  auto *Clause =
+      OMPOrderedClause::Create(getASTContext(), NumForLoops,
+                               NumForLoops ? DSAStack->getAssociatedLoops() : 0,
+                               StartLoc, LParenLoc, EndLoc);
   DSAStack->setOrderedRegion(/*IsOrdered=*/true, NumForLoops, Clause);
   return Clause;
 }

-OMPClause *Sema::ActOnOpenMPSimpleClause(
+OMPClause *SemaOpenMP::ActOnOpenMPSimpleClause(
     OpenMPClauseKind Kind, unsigned Argument, SourceLocation ArgumentLoc,
     SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) {
   OMPClause *Res = nullptr;
   switch (Kind) {
@@ -17159,11 +17216,11 @@ getListOfPossibleValues(OpenMPClauseKind K, unsigned First, unsigned Last,
   return std::string(Out.str());
 }

-OMPClause *Sema::ActOnOpenMPDefaultClause(DefaultKind Kind,
-                                          SourceLocation KindKwLoc,
-                                          SourceLocation StartLoc,
-                                          SourceLocation LParenLoc,
-                                          SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPDefaultClause(DefaultKind Kind,
+                                                SourceLocation KindKwLoc,
+                                                SourceLocation StartLoc,
+                                                SourceLocation LParenLoc,
+                                                SourceLocation EndLoc) {
   if (Kind == OMP_DEFAULT_unknown) {
     Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
         << getListOfPossibleValues(OMPC_default, /*First=*/0,
@@ -17189,39 +17246,39 @@ OMPClause *Sema::ActOnOpenMPDefaultClause(DefaultKind Kind,
     llvm_unreachable("DSA unexpected in OpenMP default clause");
   }

-  return new (Context)
+  return new (getASTContext())
       OMPDefaultClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPProcBindClause(ProcBindKind Kind,
-                                           SourceLocation KindKwLoc,
-                                           SourceLocation StartLoc,
-                                           SourceLocation LParenLoc,
-                                           SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPProcBindClause(ProcBindKind Kind,
+                                                 SourceLocation KindKwLoc,
+                                                 SourceLocation StartLoc,
+                                                 SourceLocation LParenLoc,
+                                                 SourceLocation EndLoc) {
   if (Kind == OMP_PROC_BIND_unknown) {
     Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
         << getListOfPossibleValues(OMPC_proc_bind,
                                    /*First=*/unsigned(OMP_PROC_BIND_master),
                                    /*Last=*/
-                                   unsigned(LangOpts.OpenMP > 50
+                                   unsigned(getLangOpts().OpenMP > 50
                                                 ? OMP_PROC_BIND_primary
                                                 : OMP_PROC_BIND_spread) + 1)
         << getOpenMPClauseName(OMPC_proc_bind);
     return nullptr;
   }
-  if (Kind == OMP_PROC_BIND_primary && LangOpts.OpenMP < 51)
+  if (Kind == OMP_PROC_BIND_primary && getLangOpts().OpenMP < 51)
     Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
         << getListOfPossibleValues(OMPC_proc_bind,
                                    /*First=*/unsigned(OMP_PROC_BIND_master),
                                    /*Last=*/
                                    unsigned(OMP_PROC_BIND_spread) + 1)
         << getOpenMPClauseName(OMPC_proc_bind);
-  return new (Context)
+  return new (getASTContext())
       OMPProcBindClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPAtomicDefaultMemOrderClause(
+OMPClause *SemaOpenMP::ActOnOpenMPAtomicDefaultMemOrderClause(
     OpenMPAtomicDefaultMemOrderClauseKind Kind, SourceLocation KindKwLoc,
     SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) {
   if (Kind == OMPC_ATOMIC_DEFAULT_MEM_ORDER_unknown) {
@@ -17232,15 +17289,15 @@ OMPClause *Sema::ActOnOpenMPAtomicDefaultMemOrderClause(
         << getOpenMPClauseName(OMPC_atomic_default_mem_order);
     return nullptr;
   }
-  return new (Context) OMPAtomicDefaultMemOrderClause(Kind, KindKwLoc, StartLoc,
-                                                      LParenLoc, EndLoc);
+  return new (getASTContext()) OMPAtomicDefaultMemOrderClause(
+      Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPAtClause(OpenMPAtClauseKind Kind,
-                                     SourceLocation KindKwLoc,
-                                     SourceLocation StartLoc,
-                                     SourceLocation LParenLoc,
-                                     SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPAtClause(OpenMPAtClauseKind Kind,
+                                           SourceLocation KindKwLoc,
+                                           SourceLocation StartLoc,
+                                           SourceLocation LParenLoc,
+                                           SourceLocation EndLoc) {
   if (Kind == OMPC_AT_unknown) {
     Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
         << getListOfPossibleValues(OMPC_at, /*First=*/0,
@@ -17248,15 +17305,15 @@ OMPClause *Sema::ActOnOpenMPAtClause(OpenMPAtClauseKind Kind,
         << getOpenMPClauseName(OMPC_at);
     return nullptr;
   }
-  return new (Context)
+  return new (getASTContext())
       OMPAtClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPSeverityClause(OpenMPSeverityClauseKind Kind,
-                                           SourceLocation KindKwLoc,
-                                           SourceLocation StartLoc,
-                                           SourceLocation LParenLoc,
-                                           SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPSeverityClause(OpenMPSeverityClauseKind Kind,
+                                                 SourceLocation KindKwLoc,
+                                                 SourceLocation StartLoc,
+                                                 SourceLocation LParenLoc,
+                                                 SourceLocation EndLoc) {
   if (Kind == OMPC_SEVERITY_unknown) {
     Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
         << getListOfPossibleValues(OMPC_severity, /*First=*/0,
@@ -17264,28 +17321,30 @@ OMPClause *Sema::ActOnOpenMPSeverityClause(OpenMPSeverityClauseKind Kind,
         << getOpenMPClauseName(OMPC_severity);
     return nullptr;
   }
-  return new (Context)
+  return new (getASTContext())
       OMPSeverityClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPMessageClause(Expr *ME, SourceLocation StartLoc,
-                                          SourceLocation LParenLoc,
-                                          SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPMessageClause(Expr *ME,
+                                                SourceLocation StartLoc,
+                                                SourceLocation LParenLoc,
+                                                SourceLocation EndLoc) {
   assert(ME && "NULL expr in Message clause");
   if (!isa<StringLiteral>(ME)) {
     Diag(ME->getBeginLoc(), diag::warn_clause_expected_string)
         << getOpenMPClauseName(OMPC_message);
     return nullptr;
   }
-  return new (Context) OMPMessageClause(ME, StartLoc, LParenLoc, EndLoc);
+  return new (getASTContext())
+      OMPMessageClause(ME, StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPOrderClause(
+OMPClause *SemaOpenMP::ActOnOpenMPOrderClause(
     OpenMPOrderClauseModifier Modifier, OpenMPOrderClauseKind Kind,
     SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc,
     SourceLocation KindLoc, SourceLocation EndLoc) {
   if (Kind != OMPC_ORDER_concurrent ||
-      (LangOpts.OpenMP < 51 && MLoc.isValid())) {
+      (getLangOpts().OpenMP < 51 && MLoc.isValid())) {
     // Kind should be concurrent,
     // Modifiers introduced in OpenMP 5.1
     static_assert(OMPC_ORDER_unknown > 0,
@@ -17298,7 +17357,7 @@ OMPClause *Sema::ActOnOpenMPOrderClause(
         << getOpenMPClauseName(OMPC_order);
     return nullptr;
   }
-  if (LangOpts.OpenMP >= 51) {
+  if (getLangOpts().OpenMP >= 51) {
     if (Modifier == OMPC_ORDER_MODIFIER_unknown && MLoc.isValid()) {
       Diag(MLoc, diag::err_omp_unexpected_clause_value)
           << getListOfPossibleValues(OMPC_order,
@@ -17315,21 +17374,21 @@ OMPClause *Sema::ActOnOpenMPOrderClause(
       }
     }
   }
-  return new (Context) OMPOrderClause(Kind, KindLoc, StartLoc, LParenLoc,
-                                      EndLoc, Modifier, MLoc);
+  return new (getASTContext()) OMPOrderClause(
+      Kind, KindLoc, StartLoc, LParenLoc, EndLoc, Modifier, MLoc);
 }

-OMPClause *Sema::ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind,
-                                         SourceLocation KindKwLoc,
-                                         SourceLocation StartLoc,
-                                         SourceLocation LParenLoc,
-                                         SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind,
+                                               SourceLocation KindKwLoc,
+                                               SourceLocation StartLoc,
+                                               SourceLocation LParenLoc,
+                                               SourceLocation EndLoc) {
   if (Kind == OMPC_DEPEND_unknown || Kind == OMPC_DEPEND_source ||
       Kind == OMPC_DEPEND_sink || Kind == OMPC_DEPEND_depobj) {
     SmallVector<OpenMPDependClauseKind> Except = {
         OMPC_DEPEND_source, OMPC_DEPEND_sink, OMPC_DEPEND_depobj,
         OMPC_DEPEND_outallmemory, OMPC_DEPEND_inoutallmemory};
-    if (LangOpts.OpenMP < 51)
+    if (getLangOpts().OpenMP < 51)
       Except.push_back(OMPC_DEPEND_inoutset);
     Diag(KindKwLoc, diag::err_omp_unexpected_clause_value)
         << getListOfPossibleValues(OMPC_depend, /*First=*/0,
@@ -17337,14 +17396,14 @@ OMPClause *Sema::ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind,
         << getOpenMPClauseName(OMPC_update);
     return nullptr;
   }
-  return OMPUpdateClause::Create(Context, StartLoc, LParenLoc, KindKwLoc, Kind,
-                                 EndLoc);
+  return OMPUpdateClause::Create(getASTContext(), StartLoc, LParenLoc,
+                                 KindKwLoc, Kind, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPSizesClause(ArrayRef<Expr *> SizeExprs,
-                                        SourceLocation StartLoc,
-                                        SourceLocation LParenLoc,
-                                        SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPSizesClause(ArrayRef<Expr *> SizeExprs,
+                                              SourceLocation StartLoc,
+                                              SourceLocation LParenLoc,
+                                              SourceLocation EndLoc) {
   for (Expr *SizeExpr : SizeExprs) {
     ExprResult NumForLoopsResult = VerifyPositiveIntegerConstantInClause(
         SizeExpr, OMPC_sizes, /*StrictlyPositive=*/true);
@@ -17353,19 +17412,19 @@ OMPClause *Sema::ActOnOpenMPSizesClause(ArrayRef<Expr *> SizeExprs,
   }

   DSAStack->setAssociatedLoops(SizeExprs.size());
-  return OMPSizesClause::Create(Context, StartLoc, LParenLoc, EndLoc,
+  return OMPSizesClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc,
                                 SizeExprs);
 }

-OMPClause *Sema::ActOnOpenMPFullClause(SourceLocation StartLoc,
-                                       SourceLocation EndLoc) {
-  return OMPFullClause::Create(Context, StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPFullClause(SourceLocation StartLoc,
+                                             SourceLocation EndLoc) {
+  return OMPFullClause::Create(getASTContext(), StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPPartialClause(Expr *FactorExpr,
-                                          SourceLocation StartLoc,
-                                          SourceLocation LParenLoc,
-                                          SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPPartialClause(Expr *FactorExpr,
+                                                SourceLocation StartLoc,
+                                                SourceLocation LParenLoc,
+                                                SourceLocation EndLoc) {
   if (FactorExpr) {
     // If an argument is specified, it must be a constant (or an unevaluated
     // template expression).
@@ -17376,22 +17435,22 @@ OMPClause *Sema::ActOnOpenMPPartialClause(Expr *FactorExpr,
     FactorExpr = FactorResult.get();
   }

-  return OMPPartialClause::Create(Context, StartLoc, LParenLoc, EndLoc,
+  return OMPPartialClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc,
                                   FactorExpr);
 }

-OMPClause *Sema::ActOnOpenMPAlignClause(Expr *A, SourceLocation StartLoc,
-                                        SourceLocation LParenLoc,
-                                        SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPAlignClause(Expr *A, SourceLocation StartLoc,
+                                              SourceLocation LParenLoc,
+                                              SourceLocation EndLoc) {
   ExprResult AlignVal;
   AlignVal = VerifyPositiveIntegerConstantInClause(A, OMPC_align);
   if (AlignVal.isInvalid())
     return nullptr;
-  return OMPAlignClause::Create(Context, AlignVal.get(), StartLoc, LParenLoc,
-                                EndLoc);
+  return OMPAlignClause::Create(getASTContext(), AlignVal.get(), StartLoc,
+                                LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause(
+OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
     OpenMPClauseKind Kind, ArrayRef<unsigned> Argument, Expr *Expr,
     SourceLocation StartLoc, SourceLocation LParenLoc,
     ArrayRef<SourceLocation> ArgumentLoc, SourceLocation DelimLoc,
@@ -17559,13 +17618,13 @@ static bool checkScheduleModifiers(Sema &S, OpenMPScheduleClauseModifier M1,
   return false;
 }

-OMPClause *Sema::ActOnOpenMPScheduleClause(
+OMPClause *SemaOpenMP::ActOnOpenMPScheduleClause(
     OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2,
     OpenMPScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc,
     SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc,
     SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc) {
-  if (checkScheduleModifiers(*this, M1, M2, M1Loc, M2Loc) ||
-      checkScheduleModifiers(*this, M2, M1, M2Loc, M1Loc))
+  if (checkScheduleModifiers(SemaRef, M1, M2, M1Loc, M2Loc) ||
+      checkScheduleModifiers(SemaRef, M2, M1, M2Loc, M1Loc))
     return nullptr;
   // OpenMP, 2.7.1, Loop Construct, Restrictions
   // Either the monotonic modifier or the nonmonotonic modifier can be specified
@@ -17599,7 +17658,7 @@ OMPClause *Sema::ActOnOpenMPScheduleClause(
   // The nonmonotonic modifier can only be specified with schedule(dynamic) or
   // schedule(guided).
   // OpenMP 5.0 does not have this restriction.
-  if (LangOpts.OpenMP < 50 &&
+  if (getLangOpts().OpenMP < 50 &&
       (M1 == OMPC_SCHEDULE_MODIFIER_nonmonotonic ||
        M2 == OMPC_SCHEDULE_MODIFIER_nonmonotonic) &&
       Kind != OMPC_SCHEDULE_dynamic && Kind != OMPC_SCHEDULE_guided) {
@@ -17625,7 +17684,7 @@ OMPClause *Sema::ActOnOpenMPScheduleClause(
     // chunk_size must be a loop invariant integer expression with a positive
     // value.
     if (std::optional<llvm::APSInt> Result =
-            ValExpr->getIntegerConstantExpr(Context)) {
+            ValExpr->getIntegerConstantExpr(getASTContext())) {
       if (Result->isSigned() && !Result->isStrictlyPositive()) {
         Diag(ChunkSizeLoc, diag::err_omp_negative_expression_in_clause)
             << "schedule" << 1 << ChunkSize->getSourceRange();
@@ -17633,24 +17692,24 @@ OMPClause *Sema::ActOnOpenMPScheduleClause(
       }
     } else if (getOpenMPCaptureRegionForClause(
                    DSAStack->getCurrentDirective(), OMPC_schedule,
-                   LangOpts.OpenMP) != OMPD_unknown &&
-               !CurContext->isDependentContext()) {
-      ValExpr = MakeFullExpr(ValExpr).get();
+                   getLangOpts().OpenMP) != OMPD_unknown &&
+               !SemaRef.CurContext->isDependentContext()) {
+      ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
       llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
-      ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
-      HelperValStmt = buildPreInits(Context, Captures);
+      ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
+      HelperValStmt = buildPreInits(getASTContext(), Captures);
     }
   }

-  return new (Context)
+  return new (getASTContext())
       OMPScheduleClause(StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc, Kind,
                         ValExpr, HelperValStmt, M1, M1Loc, M2, M2Loc);
 }

-OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind,
-                                   SourceLocation StartLoc,
-                                   SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPClause(OpenMPClauseKind Kind,
+                                         SourceLocation StartLoc,
+                                         SourceLocation EndLoc) {
   OMPClause *Res = nullptr;
   switch (Kind) {
   case OMPC_ordered:
@@ -17804,134 +17863,138 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind,
   return Res;
 }

-OMPClause *Sema::ActOnOpenMPNowaitClause(SourceLocation StartLoc,
-                                         SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPNowaitClause(SourceLocation StartLoc,
+                                               SourceLocation EndLoc) {
   DSAStack->setNowaitRegion();
-  return new (Context) OMPNowaitClause(StartLoc, EndLoc);
+  return new (getASTContext()) OMPNowaitClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPUntiedClause(SourceLocation StartLoc,
-                                         SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPUntiedClause(SourceLocation StartLoc,
+                                               SourceLocation EndLoc) {
   DSAStack->setUntiedRegion();
-  return new (Context) OMPUntiedClause(StartLoc, EndLoc);
+  return new (getASTContext()) OMPUntiedClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPMergeableClause(SourceLocation StartLoc,
-                                            SourceLocation EndLoc) {
-  return new (Context) OMPMergeableClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPMergeableClause(SourceLocation StartLoc,
+                                                  SourceLocation EndLoc) {
+  return new (getASTContext()) OMPMergeableClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPReadClause(SourceLocation StartLoc,
-                                       SourceLocation EndLoc) {
-  return new (Context) OMPReadClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPReadClause(SourceLocation StartLoc,
+                                             SourceLocation EndLoc) {
+  return new (getASTContext()) OMPReadClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPWriteClause(SourceLocation StartLoc,
-                                        SourceLocation EndLoc) {
-  return new (Context) OMPWriteClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPWriteClause(SourceLocation StartLoc,
+                                              SourceLocation EndLoc) {
+  return new (getASTContext()) OMPWriteClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPUpdateClause(SourceLocation StartLoc,
-                                         SourceLocation EndLoc) {
-  return OMPUpdateClause::Create(Context, StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPUpdateClause(SourceLocation StartLoc,
+                                               SourceLocation EndLoc) {
+  return OMPUpdateClause::Create(getASTContext(), StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPCaptureClause(SourceLocation StartLoc,
-                                          SourceLocation EndLoc) {
-  return new (Context) OMPCaptureClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPCaptureClause(SourceLocation StartLoc,
+                                                SourceLocation EndLoc) {
+  return new (getASTContext()) OMPCaptureClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPCompareClause(SourceLocation StartLoc,
-                                          SourceLocation EndLoc) {
-  return new (Context) OMPCompareClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPCompareClause(SourceLocation StartLoc,
+                                                SourceLocation EndLoc) {
+  return new (getASTContext()) OMPCompareClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPFailClause(SourceLocation StartLoc,
-                                       SourceLocation EndLoc) {
-  return new (Context) OMPFailClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPFailClause(SourceLocation StartLoc,
+                                             SourceLocation EndLoc) {
+  return new (getASTContext()) OMPFailClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPFailClause(
-    OpenMPClauseKind Parameter, SourceLocation KindLoc,
-    SourceLocation StartLoc, SourceLocation LParenLoc,
-    SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPFailClause(OpenMPClauseKind Parameter,
+                                             SourceLocation KindLoc,
+                                             SourceLocation StartLoc,
+                                             SourceLocation LParenLoc,
+                                             SourceLocation EndLoc) {
   if (!checkFailClauseParameter(Parameter)) {
     Diag(KindLoc, diag::err_omp_atomic_fail_wrong_or_no_clauses);
     return nullptr;
   }
-  return new (Context)
+  return new (getASTContext())
       OMPFailClause(Parameter, KindLoc, StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPSeqCstClause(SourceLocation StartLoc,
-                                         SourceLocation EndLoc) {
-  return new (Context) OMPSeqCstClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPSeqCstClause(SourceLocation StartLoc,
+                                               SourceLocation EndLoc) {
+  return new (getASTContext()) OMPSeqCstClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPAcqRelClause(SourceLocation StartLoc,
-                                         SourceLocation EndLoc) {
-  return new (Context) OMPAcqRelClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPAcqRelClause(SourceLocation StartLoc,
+                                               SourceLocation EndLoc) {
+  return new (getASTContext()) OMPAcqRelClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPAcquireClause(SourceLocation StartLoc,
-                                          SourceLocation EndLoc) {
-  return new (Context) OMPAcquireClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPAcquireClause(SourceLocation StartLoc,
+                                                SourceLocation EndLoc) {
+  return new (getASTContext()) OMPAcquireClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPReleaseClause(SourceLocation StartLoc,
-                                          SourceLocation EndLoc) {
-  return new (Context) OMPReleaseClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPReleaseClause(SourceLocation StartLoc,
+                                                SourceLocation EndLoc) {
+  return new (getASTContext()) OMPReleaseClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPRelaxedClause(SourceLocation StartLoc,
-                                          SourceLocation EndLoc) {
-  return new (Context) OMPRelaxedClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPRelaxedClause(SourceLocation StartLoc,
+                                                SourceLocation EndLoc) {
+  return new (getASTContext()) OMPRelaxedClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPWeakClause(SourceLocation StartLoc,
-                                       SourceLocation EndLoc) {
-  return new (Context) OMPWeakClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPWeakClause(SourceLocation StartLoc,
+                                             SourceLocation EndLoc) {
+  return new (getASTContext()) OMPWeakClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPThreadsClause(SourceLocation StartLoc,
-                                          SourceLocation EndLoc) {
-  return new (Context) OMPThreadsClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPThreadsClause(SourceLocation StartLoc,
+                                                SourceLocation EndLoc) {
+  return new (getASTContext()) OMPThreadsClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPSIMDClause(SourceLocation StartLoc,
-                                       SourceLocation EndLoc) {
-  return new (Context) OMPSIMDClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPSIMDClause(SourceLocation StartLoc,
+                                             SourceLocation EndLoc) {
+  return new (getASTContext()) OMPSIMDClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPNogroupClause(SourceLocation StartLoc,
-                                          SourceLocation EndLoc) {
-  return new (Context) OMPNogroupClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPNogroupClause(SourceLocation StartLoc,
+                                                SourceLocation EndLoc) {
+  return new (getASTContext()) OMPNogroupClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPUnifiedAddressClause(SourceLocation StartLoc,
-                                                 SourceLocation EndLoc) {
-  return new (Context) OMPUnifiedAddressClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPUnifiedAddressClause(SourceLocation StartLoc,
+                                                       SourceLocation EndLoc) {
+  return new (getASTContext()) OMPUnifiedAddressClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPUnifiedSharedMemoryClause(SourceLocation StartLoc,
-                                                      SourceLocation EndLoc) {
-  return new (Context) OMPUnifiedSharedMemoryClause(StartLoc, EndLoc);
+OMPClause *
+SemaOpenMP::ActOnOpenMPUnifiedSharedMemoryClause(SourceLocation StartLoc,
+                                                 SourceLocation EndLoc) {
+  return new (getASTContext()) OMPUnifiedSharedMemoryClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPReverseOffloadClause(SourceLocation StartLoc,
-                                                 SourceLocation EndLoc) {
-  return new (Context) OMPReverseOffloadClause(StartLoc, EndLoc);
+OMPClause *SemaOpenMP::ActOnOpenMPReverseOffloadClause(SourceLocation StartLoc,
+                                                       SourceLocation EndLoc) {
+  return new (getASTContext()) OMPReverseOffloadClause(StartLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc,
-                                                    SourceLocation EndLoc) {
-  return new (Context) OMPDynamicAllocatorsClause(StartLoc, EndLoc);
+OMPClause *
+SemaOpenMP::ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc,
+                                               SourceLocation EndLoc) {
+  return new (getASTContext()) OMPDynamicAllocatorsClause(StartLoc, EndLoc);
 }

-StmtResult Sema::ActOnOpenMPInteropDirective(ArrayRef<OMPClause *> Clauses,
-                                             SourceLocation StartLoc,
-                                             SourceLocation EndLoc) {
+StmtResult
+SemaOpenMP::ActOnOpenMPInteropDirective(ArrayRef<OMPClause *> Clauses,
+                                        SourceLocation StartLoc,
+                                        SourceLocation EndLoc) {
   // OpenMP 5.1 [2.15.1, interop Construct, Restrictions]
   // At least one action-clause must appear on a directive.
@@ -17981,13 +18044,13 @@ StmtResult Sema::ActOnOpenMPInteropDirective(ArrayRef<OMPClause *> Clauses,

     if (ClauseKind == OMPC_init) {
       auto *E = cast<OMPInitClause>(C)->getInteropVar();
-      DeclResult = getPrivateItem(*this, E, ELoc, ERange);
+      DeclResult = getPrivateItem(SemaRef, E, ELoc, ERange);
     } else if (ClauseKind == OMPC_use) {
       auto *E = cast<OMPUseClause>(C)->getInteropVar();
-      DeclResult = getPrivateItem(*this, E, ELoc, ERange);
+      DeclResult = getPrivateItem(SemaRef, E, ELoc, ERange);
     } else if (ClauseKind == OMPC_destroy) {
       auto *E = cast<OMPDestroyClause>(C)->getInteropVar();
-      DeclResult = getPrivateItem(*this, E, ELoc, ERange);
+      DeclResult = getPrivateItem(SemaRef, E, ELoc, ERange);
     }

     if (DeclResult.first) {
@@ -17999,7 +18062,8 @@ StmtResult Sema::ActOnOpenMPInteropDirective(ArrayRef<OMPClause *> Clauses,
     }
   }

-  return OMPInteropDirective::Create(Context, StartLoc, EndLoc, Clauses);
+  return OMPInteropDirective::Create(getASTContext(), StartLoc, EndLoc,
+                                     Clauses);
 }

 static bool isValidInteropVariable(Sema &SemaRef, Expr *InteropVarExpr,
@@ -18059,12 +18123,11 @@ static bool isValidInteropVariable(Sema &SemaRef, Expr *InteropVarExpr,
   return true;
 }

-OMPClause *
-Sema::ActOnOpenMPInitClause(Expr *InteropVar, OMPInteropInfo &InteropInfo,
-                            SourceLocation StartLoc, SourceLocation LParenLoc,
-                            SourceLocation VarLoc, SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPInitClause(
+    Expr *InteropVar, OMPInteropInfo &InteropInfo, SourceLocation StartLoc,
+    SourceLocation LParenLoc, SourceLocation VarLoc, SourceLocation EndLoc) {

-  if (!isValidInteropVariable(*this, InteropVar, VarLoc, OMPC_init))
+  if (!isValidInteropVariable(SemaRef, InteropVar, VarLoc, OMPC_init))
     return nullptr;

   // Check prefer_type values.  These foreign-runtime-id values are either
@@ -18073,7 +18136,7 @@ Sema::ActOnOpenMPInitClause(Expr *InteropVar, OMPInteropInfo &InteropInfo,
     if (E->isValueDependent() || E->isTypeDependent() ||
         E->isInstantiationDependent() || E->containsUnexpandedParameterPack())
       continue;
-    if (E->isIntegerConstantExpr(Context))
+    if (E->isIntegerConstantExpr(getASTContext()))
      continue;
    if (isa<StringLiteral>(E))
      continue;
@@ -18081,28 +18144,29 @@ Sema::ActOnOpenMPInitClause(Expr *InteropVar, OMPInteropInfo &InteropInfo,
     return nullptr;
   }

-  return OMPInitClause::Create(Context, InteropVar, InteropInfo, StartLoc,
-                               LParenLoc, VarLoc, EndLoc);
+  return OMPInitClause::Create(getASTContext(), InteropVar, InteropInfo,
+                               StartLoc, LParenLoc, VarLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPUseClause(Expr *InteropVar, SourceLocation StartLoc,
-                                      SourceLocation LParenLoc,
-                                      SourceLocation VarLoc,
-                                      SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPUseClause(Expr *InteropVar,
+                                            SourceLocation StartLoc,
+                                            SourceLocation LParenLoc,
+                                            SourceLocation VarLoc,
+                                            SourceLocation EndLoc) {

-  if (!isValidInteropVariable(*this, InteropVar, VarLoc, OMPC_use))
+  if (!isValidInteropVariable(SemaRef, InteropVar, VarLoc, OMPC_use))
     return nullptr;

-  return new (Context)
+  return new (getASTContext())
       OMPUseClause(InteropVar, StartLoc, LParenLoc, VarLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPDestroyClause(Expr *InteropVar,
-                                          SourceLocation StartLoc,
-                                          SourceLocation LParenLoc,
-                                          SourceLocation VarLoc,
-                                          SourceLocation EndLoc) {
-  if (!InteropVar && LangOpts.OpenMP >= 52 &&
+OMPClause *SemaOpenMP::ActOnOpenMPDestroyClause(Expr *InteropVar,
+                                                SourceLocation StartLoc,
+                                                SourceLocation LParenLoc,
+                                                SourceLocation VarLoc,
+                                                SourceLocation EndLoc) {
+  if (!InteropVar && getLangOpts().OpenMP >= 52 &&
       DSAStack->getCurrentDirective() == OMPD_depobj) {
     Diag(StartLoc, diag::err_omp_expected_clause_argument)
         << getOpenMPClauseName(OMPC_destroy)
@@ -18110,100 +18174,103 @@ OMPClause *Sema::ActOnOpenMPDestroyClause(Expr *InteropVar,
     return nullptr;
   }
   if (InteropVar &&
-      !isValidInteropVariable(*this, InteropVar, VarLoc, OMPC_destroy))
+      !isValidInteropVariable(SemaRef, InteropVar, VarLoc, OMPC_destroy))
     return nullptr;

-  return new (Context)
+  return new (getASTContext())
       OMPDestroyClause(InteropVar, StartLoc, LParenLoc, VarLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPNovariantsClause(Expr *Condition,
-                                             SourceLocation StartLoc,
-                                             SourceLocation LParenLoc,
-                                             SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPNovariantsClause(Expr *Condition,
+                                                   SourceLocation StartLoc,
+                                                   SourceLocation LParenLoc,
+                                                   SourceLocation EndLoc) {
   Expr *ValExpr = Condition;
   Stmt *HelperValStmt = nullptr;
   OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
   if (!Condition->isValueDependent() && !Condition->isTypeDependent() &&
       !Condition->isInstantiationDependent() &&
       !Condition->containsUnexpandedParameterPack()) {
-    ExprResult Val = CheckBooleanCondition(StartLoc, Condition);
+    ExprResult Val = SemaRef.CheckBooleanCondition(StartLoc, Condition);
     if (Val.isInvalid())
       return nullptr;

-    ValExpr = MakeFullExpr(Val.get()).get();
+    ValExpr = SemaRef.MakeFullExpr(Val.get()).get();

     OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
     CaptureRegion = getOpenMPCaptureRegionForClause(DKind, OMPC_novariants,
-                                                    LangOpts.OpenMP);
-    if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
-      ValExpr = MakeFullExpr(ValExpr).get();
+                                                    getLangOpts().OpenMP);
+    if (CaptureRegion != OMPD_unknown &&
+        !SemaRef.CurContext->isDependentContext()) {
+      ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
       llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
-      ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
-      HelperValStmt = buildPreInits(Context, Captures);
+      ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
+      HelperValStmt = buildPreInits(getASTContext(), Captures);
    }
  }

-  return new (Context) OMPNovariantsClause(
+  return new (getASTContext()) OMPNovariantsClause(
       ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPNocontextClause(Expr *Condition,
-                                            SourceLocation StartLoc,
-                                            SourceLocation LParenLoc,
-                                            SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPNocontextClause(Expr *Condition,
+                                                  SourceLocation StartLoc,
+                                                  SourceLocation LParenLoc,
+                                                  SourceLocation EndLoc) {
   Expr *ValExpr = Condition;
   Stmt *HelperValStmt = nullptr;
   OpenMPDirectiveKind CaptureRegion = OMPD_unknown;
   if (!Condition->isValueDependent() && !Condition->isTypeDependent() &&
       !Condition->isInstantiationDependent() &&
       !Condition->containsUnexpandedParameterPack()) {
-    ExprResult Val = CheckBooleanCondition(StartLoc, Condition);
+    ExprResult Val = SemaRef.CheckBooleanCondition(StartLoc, Condition);
     if (Val.isInvalid())
       return nullptr;

-    ValExpr = MakeFullExpr(Val.get()).get();
+    ValExpr = SemaRef.MakeFullExpr(Val.get()).get();

     OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
-    CaptureRegion =
-        getOpenMPCaptureRegionForClause(DKind, OMPC_nocontext, LangOpts.OpenMP);
-    if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
-      ValExpr = MakeFullExpr(ValExpr).get();
+    CaptureRegion = getOpenMPCaptureRegionForClause(DKind, OMPC_nocontext,
+                                                    getLangOpts().OpenMP);
+    if (CaptureRegion != OMPD_unknown &&
+        !SemaRef.CurContext->isDependentContext()) {
+      ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
       llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
-      ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
-      HelperValStmt = buildPreInits(Context, Captures);
+      ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
+      HelperValStmt = buildPreInits(getASTContext(), Captures);
     }
   }

-  return new (Context) OMPNocontextClause(ValExpr, HelperValStmt, CaptureRegion,
-                                          StartLoc, LParenLoc, EndLoc);
+  return new (getASTContext()) OMPNocontextClause(
+      ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPFilterClause(Expr *ThreadID,
-                                         SourceLocation StartLoc,
-                                         SourceLocation LParenLoc,
-                                         SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPFilterClause(Expr *ThreadID,
+                                               SourceLocation StartLoc,
+                                               SourceLocation LParenLoc,
+                                               SourceLocation EndLoc) {
   Expr *ValExpr = ThreadID;
   Stmt *HelperValStmt = nullptr;

   OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective();
   OpenMPDirectiveKind CaptureRegion =
-      getOpenMPCaptureRegionForClause(DKind, OMPC_filter, LangOpts.OpenMP);
-  if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) {
-    ValExpr = MakeFullExpr(ValExpr).get();
+      getOpenMPCaptureRegionForClause(DKind, OMPC_filter, getLangOpts().OpenMP);
+  if (CaptureRegion != OMPD_unknown &&
+      !SemaRef.CurContext->isDependentContext()) {
+    ValExpr = SemaRef.MakeFullExpr(ValExpr).get();
     llvm::MapVector<const Expr *, DeclRefExpr *> Captures;
-    ValExpr = tryBuildCapture(*this, ValExpr, Captures).get();
-    HelperValStmt = buildPreInits(Context, Captures);
+    ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get();
+    HelperValStmt = buildPreInits(getASTContext(), Captures);
   }

-  return new (Context) OMPFilterClause(ValExpr, HelperValStmt, CaptureRegion,
-                                       StartLoc, LParenLoc, EndLoc);
+  return new (getASTContext()) OMPFilterClause(
+      ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc);
 }

-OMPClause *Sema::ActOnOpenMPVarListClause(OpenMPClauseKind Kind,
-                                          ArrayRef<Expr *> VarList,
-                                          const OMPVarListLocTy &Locs,
-                                          OpenMPVarListDataTy &Data) {
+OMPClause *SemaOpenMP::ActOnOpenMPVarListClause(OpenMPClauseKind Kind,
+                                                ArrayRef<Expr *> VarList,
+                                                const OMPVarListLocTy &Locs,
+                                                OpenMPVarListDataTy &Data) {
   SourceLocation StartLoc = Locs.StartLoc;
   SourceLocation LParenLoc = Locs.LParenLoc;
   SourceLocation EndLoc = Locs.EndLoc;
@@ -18395,29 +18462,30 @@ OMPClause *Sema::ActOnOpenMPVarListClause(OpenMPClauseKind Kind,
   return Res;
 }

-ExprResult Sema::getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK,
-                                       ExprObjectKind OK, SourceLocation Loc) {
-  ExprResult Res = BuildDeclRefExpr(
+ExprResult SemaOpenMP::getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK,
+                                             ExprObjectKind OK,
+                                             SourceLocation Loc) {
+  ExprResult Res = SemaRef.BuildDeclRefExpr(
       Capture, Capture->getType().getNonReferenceType(), VK_LValue, Loc);
   if (!Res.isUsable())
     return ExprError();
   if (OK == OK_Ordinary && !getLangOpts().CPlusPlus) {
-    Res = CreateBuiltinUnaryOp(Loc, UO_Deref, Res.get());
+    Res = SemaRef.CreateBuiltinUnaryOp(Loc, UO_Deref, Res.get());
     if (!Res.isUsable())
       return ExprError();
   }
   if (VK != VK_LValue && Res.get()->isGLValue()) {
-    Res = DefaultLvalueConversion(Res.get());
+    Res = SemaRef.DefaultLvalueConversion(Res.get());
     if (!Res.isUsable())
       return ExprError();
   }
   return Res;
 }

-OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
-                                          SourceLocation StartLoc,
-                                          SourceLocation LParenLoc,
-                                          SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
+                                                SourceLocation StartLoc,
+                                                SourceLocation LParenLoc,
+                                                SourceLocation EndLoc) {
   SmallVector<Expr *, 8> Vars;
   SmallVector<Expr *, 8> PrivateCopies;
   bool IsImplicitClause =
@@ -18427,7 +18495,7 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
     SourceLocation ELoc;
     SourceRange ERange;
     Expr *SimpleRefExpr = RefExpr;
-    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
+    auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange);
     if (Res.second) {
       // It will be analyzed later.
       Vars.push_back(RefExpr);
@@ -18443,7 +18511,8 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
     // OpenMP [2.9.3.3, Restrictions, C/C++, p.3]
     // A variable that appears in a private clause must not have an incomplete
     // type or a reference type.
-    if (RequireCompleteType(ELoc, Type, diag::err_omp_private_incomplete_type))
+    if (SemaRef.RequireCompleteType(ELoc, Type,
+                                    diag::err_omp_private_incomplete_type))
       continue;
     Type = Type.getNonReferenceType();
@@ -18455,7 +18524,7 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
     // OpenMP 3.1 [2.9.3.3, private clause, Restrictions]
     // A variable that appears in a private clause must not have a
     // const-qualified type unless it is of class type with a mutable member.
-    if (rejectConstNotMutableType(*this, D, Type, OMPC_private, ELoc))
+    if (rejectConstNotMutableType(SemaRef, D, Type, OMPC_private, ELoc))
       continue;

     // OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced
@@ -18469,7 +18538,7 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
     if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_private) {
       Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind)
                                           << getOpenMPClauseName(OMPC_private);
-      reportOriginalDsa(*this, DSAStack, D, DVar);
+      reportOriginalDsa(SemaRef, DSAStack, D, DVar);
       continue;
     }

@@ -18480,7 +18549,7 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
       Diag(ELoc, diag::err_omp_variably_modified_type_not_supported)
           << getOpenMPClauseName(OMPC_private) << Type
          << getOpenMPDirectiveName(CurrDir);
-      bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
+      bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) ==
                                VarDecl::DeclarationOnly;
       Diag(D->getLocation(),
           IsDecl ? diag::note_previous_decl : diag::note_defined_here)
@@ -18496,7 +18565,8 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
     // A list item cannot appear in both a map clause and a data-sharing
     // attribute clause on the same construct unless the construct is a
     // combined construct.
-    if ((LangOpts.OpenMP <= 45 && isOpenMPTargetExecutionDirective(CurrDir)) ||
+    if ((getLangOpts().OpenMP <= 45 &&
+         isOpenMPTargetExecutionDirective(CurrDir)) ||
         CurrDir == OMPD_target) {
       OpenMPClauseKind ConflictKind;
       if (DSAStack->checkMappableExprComponentListsForDecl(
@@ -18510,7 +18580,7 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
             << getOpenMPClauseName(OMPC_private)
             << getOpenMPClauseName(ConflictKind)
             << getOpenMPDirectiveName(CurrDir);
-        reportOriginalDsa(*this, DSAStack, D, DVar);
+        reportOriginalDsa(SemaRef, DSAStack, D, DVar);
         continue;
       }
     }
@@ -18526,28 +18596,28 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
     // proper diagnostics.
     Type = Type.getUnqualifiedType();
     VarDecl *VDPrivate =
-        buildVarDecl(*this, ELoc, Type, D->getName(),
+        buildVarDecl(SemaRef, ELoc, Type, D->getName(),
                      D->hasAttrs() ? &D->getAttrs() : nullptr,
                      VD ? cast<DeclRefExpr>(SimpleRefExpr) : nullptr);
-    ActOnUninitializedDecl(VDPrivate);
+    SemaRef.ActOnUninitializedDecl(VDPrivate);
     if (VDPrivate->isInvalidDecl())
       continue;
     DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr(
-        *this, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc);
+        SemaRef, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc);

     DeclRefExpr *Ref = nullptr;
-    if (!VD && !CurContext->isDependentContext()) {
+    if (!VD && !SemaRef.CurContext->isDependentContext()) {
       auto *FD = dyn_cast<FieldDecl>(D);
       VarDecl *VD = FD ? DSAStack->getImplicitFDCapExprDecl(FD) : nullptr;
       if (VD)
-        Ref = buildDeclRefExpr(*this, VD, VD->getType().getNonReferenceType(),
+        Ref = buildDeclRefExpr(SemaRef, VD, VD->getType().getNonReferenceType(),
                                RefExpr->getExprLoc());
       else
-        Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false);
+        Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/false);
     }
     if (!IsImplicitClause)
       DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_private, Ref);
-    Vars.push_back((VD || CurContext->isDependentContext())
+    Vars.push_back((VD || SemaRef.CurContext->isDependentContext())
                        ? RefExpr->IgnoreParens()
                        : Ref);
     PrivateCopies.push_back(VDPrivateRefExpr);
@@ -18556,14 +18626,14 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
   if (Vars.empty())
     return nullptr;

-  return OMPPrivateClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars,
-                                  PrivateCopies);
+  return OMPPrivateClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc,
+                                  Vars, PrivateCopies);
 }

-OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
-                                               SourceLocation StartLoc,
-                                               SourceLocation LParenLoc,
-                                               SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
+                                                     SourceLocation StartLoc,
+                                                     SourceLocation LParenLoc,
+                                                     SourceLocation EndLoc) {
   SmallVector<Expr *, 8> Vars;
   SmallVector<Expr *, 8> PrivateCopies;
   SmallVector<Expr *, 8> Inits;
@@ -18577,7 +18647,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
     SourceLocation ELoc;
     SourceRange ERange;
     Expr *SimpleRefExpr = RefExpr;
-    auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange);
+    auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange);
     if (Res.second) {
       // It will be analyzed later.
       Vars.push_back(RefExpr);
@@ -18595,8 +18665,8 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
     // OpenMP [2.9.3.3, Restrictions, C/C++, p.3]
     // A variable that appears in a private clause must not have an incomplete
     // type or a reference type.
-    if (RequireCompleteType(ELoc, Type,
-                            diag::err_omp_firstprivate_incomplete_type))
+    if (SemaRef.RequireCompleteType(ELoc, Type,
+                                    diag::err_omp_firstprivate_incomplete_type))
       continue;
     Type = Type.getNonReferenceType();
@@ -18604,7 +18674,8 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
     // A variable of class type (or array thereof) that appears in a private
     // clause requires an accessible, unambiguous copy constructor for the
     // class type.
-    QualType ElemType = Context.getBaseElementType(Type).getNonReferenceType();
+    QualType ElemType =
+        getASTContext().getBaseElementType(Type).getNonReferenceType();

     // If an implicit firstprivate variable found it was checked already.
     DSAStackTy::DSAVarData TopDVar;
@@ -18613,7 +18684,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
           DSAStack->getTopDSA(D, /*FromParent=*/false);
       TopDVar = DVar;
       OpenMPDirectiveKind CurrDir = DSAStack->getCurrentDirective();
-      bool IsConstant = ElemType.isConstant(Context);
+      bool IsConstant = ElemType.isConstant(getASTContext());
       // OpenMP [2.4.13, Data-sharing Attribute Clauses]
       // A list item that specifies a given variable may not appear in more
       // than one clause on the same directive, except that a variable may be
@@ -18628,7 +18699,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
         Diag(ELoc, diag::err_omp_wrong_dsa)
             << getOpenMPClauseName(DVar.CKind)
             << getOpenMPClauseName(OMPC_firstprivate);
-        reportOriginalDsa(*this, DSAStack, D, DVar);
+        reportOriginalDsa(SemaRef, DSAStack, D, DVar);
         continue;
       }

@@ -18648,7 +18719,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
         Diag(ELoc, diag::err_omp_wrong_dsa)
             << getOpenMPClauseName(DVar.CKind)
             << getOpenMPClauseName(OMPC_firstprivate);
-        reportOriginalDsa(*this, DSAStack, D, DVar);
+        reportOriginalDsa(SemaRef, DSAStack, D, DVar);
         continue;
       }

@@ -18679,7 +18750,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
           Diag(ELoc, diag::err_omp_required_access)
               << getOpenMPClauseName(OMPC_firstprivate)
               << getOpenMPClauseName(OMPC_shared);
-          reportOriginalDsa(*this, DSAStack, D, DVar);
+          reportOriginalDsa(SemaRef, DSAStack, D, DVar);
           continue;
         }
       }
@@ -18712,7 +18783,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
              isOpenMPTeamsDirective(DVar.DKind))) {
           Diag(ELoc, diag::err_omp_parallel_reduction_in_task_firstprivate)
               << getOpenMPDirectiveName(DVar.DKind);
-          reportOriginalDsa(*this, DSAStack, D, DVar);
+          reportOriginalDsa(SemaRef, DSAStack, D, DVar);
           continue;
         }
       }
@@ -18725,7 +18796,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
       // A list item cannot appear in both a map clause and a data-sharing
       // attribute clause on the same construct unless the construct is a
      // combined construct.
-      if ((LangOpts.OpenMP <= 45 &&
+      if ((getLangOpts().OpenMP <= 45 &&
           isOpenMPTargetExecutionDirective(CurrDir)) ||
          CurrDir == OMPD_target) {
        OpenMPClauseKind ConflictKind;
@@ -18741,7 +18812,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
               << getOpenMPClauseName(OMPC_firstprivate)
               << getOpenMPClauseName(ConflictKind)
               << getOpenMPDirectiveName(DSAStack->getCurrentDirective());
-          reportOriginalDsa(*this, DSAStack, D, DVar);
+          reportOriginalDsa(SemaRef, DSAStack, D, DVar);
           continue;
         }
       }
@@ -18753,7 +18824,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
       Diag(ELoc, diag::err_omp_variably_modified_type_not_supported)
           << getOpenMPClauseName(OMPC_firstprivate) << Type
          << getOpenMPDirectiveName(DSAStack->getCurrentDirective());
-      bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) ==
+      bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) ==
                                VarDecl::DeclarationOnly;
       Diag(D->getLocation(),
           IsDecl ? diag::note_previous_decl : diag::note_defined_here)
@@ -18763,7 +18834,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,

     Type = Type.getUnqualifiedType();
     VarDecl *VDPrivate =
-        buildVarDecl(*this, ELoc, Type, D->getName(),
+        buildVarDecl(SemaRef, ELoc, Type, D->getName(),
                      D->hasAttrs() ? &D->getAttrs() : nullptr,
cast(SimpleRefExpr) : nullptr); // Generate helper private variable and initialize it with the value of the @@ -18776,32 +18847,32 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef VarList, // original array element in CodeGen. if (Type->isArrayType()) { VarDecl *VDInit = - buildVarDecl(*this, RefExpr->getExprLoc(), ElemType, D->getName()); - VDInitRefExpr = buildDeclRefExpr(*this, VDInit, ElemType, ELoc); - Expr *Init = DefaultLvalueConversion(VDInitRefExpr).get(); + buildVarDecl(SemaRef, RefExpr->getExprLoc(), ElemType, D->getName()); + VDInitRefExpr = buildDeclRefExpr(SemaRef, VDInit, ElemType, ELoc); + Expr *Init = SemaRef.DefaultLvalueConversion(VDInitRefExpr).get(); ElemType = ElemType.getUnqualifiedType(); - VarDecl *VDInitTemp = buildVarDecl(*this, RefExpr->getExprLoc(), ElemType, - ".firstprivate.temp"); + VarDecl *VDInitTemp = buildVarDecl(SemaRef, RefExpr->getExprLoc(), + ElemType, ".firstprivate.temp"); InitializedEntity Entity = InitializedEntity::InitializeVariable(VDInitTemp); InitializationKind Kind = InitializationKind::CreateCopy(ELoc, ELoc); - InitializationSequence InitSeq(*this, Entity, Kind, Init); - ExprResult Result = InitSeq.Perform(*this, Entity, Kind, Init); + InitializationSequence InitSeq(SemaRef, Entity, Kind, Init); + ExprResult Result = InitSeq.Perform(SemaRef, Entity, Kind, Init); if (Result.isInvalid()) VDPrivate->setInvalidDecl(); else VDPrivate->setInit(Result.getAs()); // Remove temp variable declaration. - Context.Deallocate(VDInitTemp); + getASTContext().Deallocate(VDInitTemp); } else { - VarDecl *VDInit = buildVarDecl(*this, RefExpr->getExprLoc(), Type, + VarDecl *VDInit = buildVarDecl(SemaRef, RefExpr->getExprLoc(), Type, ".firstprivate.temp"); - VDInitRefExpr = buildDeclRefExpr(*this, VDInit, RefExpr->getType(), + VDInitRefExpr = buildDeclRefExpr(SemaRef, VDInit, RefExpr->getType(), RefExpr->getExprLoc()); - AddInitializerToDecl(VDPrivate, - DefaultLvalueConversion(VDInitRefExpr).get(), - /*DirectInit=*/false); + SemaRef.AddInitializerToDecl( + VDPrivate, SemaRef.DefaultLvalueConversion(VDInitRefExpr).get(), + /*DirectInit=*/false); } if (VDPrivate->isInvalidDecl()) { if (IsImplicitClause) { @@ -18810,29 +18881,30 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef VarList, } continue; } - CurContext->addDecl(VDPrivate); + SemaRef.CurContext->addDecl(VDPrivate); DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr( - *this, VDPrivate, RefExpr->getType().getUnqualifiedType(), + SemaRef, VDPrivate, RefExpr->getType().getUnqualifiedType(), RefExpr->getExprLoc()); DeclRefExpr *Ref = nullptr; - if (!VD && !CurContext->isDependentContext()) { + if (!VD && !SemaRef.CurContext->isDependentContext()) { if (TopDVar.CKind == OMPC_lastprivate) { Ref = TopDVar.PrivateCopy; } else { auto *FD = dyn_cast(D); VarDecl *VD = FD ? DSAStack->getImplicitFDCapExprDecl(FD) : nullptr; if (VD) - Ref = buildDeclRefExpr(*this, VD, VD->getType().getNonReferenceType(), - RefExpr->getExprLoc()); + Ref = + buildDeclRefExpr(SemaRef, VD, VD->getType().getNonReferenceType(), + RefExpr->getExprLoc()); else - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); if (VD || !isOpenMPCapturedDecl(D)) ExprCaptures.push_back(Ref->getDecl()); } } if (!IsImplicitClause) DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_firstprivate, Ref); - Vars.push_back((VD || CurContext->isDependentContext()) + Vars.push_back((VD || SemaRef.CurContext->isDependentContext()) ? 
RefExpr->IgnoreParens() : Ref); PrivateCopies.push_back(VDPrivateRefExpr); @@ -18842,12 +18914,12 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPFirstprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc, - Vars, PrivateCopies, Inits, - buildPreInits(Context, ExprCaptures)); + return OMPFirstprivateClause::Create( + getASTContext(), StartLoc, LParenLoc, EndLoc, Vars, PrivateCopies, Inits, + buildPreInits(getASTContext(), ExprCaptures)); } -OMPClause *Sema::ActOnOpenMPLastprivateClause( +OMPClause *SemaOpenMP::ActOnOpenMPLastprivateClause( ArrayRef VarList, OpenMPLastprivateModifier LPKind, SourceLocation LPKindLoc, SourceLocation ColonLoc, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { @@ -18871,7 +18943,7 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -18889,8 +18961,8 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( // OpenMP [2.14.3.5, Restrictions, C/C++, p.2] // A variable that appears in a lastprivate clause must not have an // incomplete type or a reference type. - if (RequireCompleteType(ELoc, Type, - diag::err_omp_lastprivate_incomplete_type)) + if (SemaRef.RequireCompleteType(ELoc, Type, + diag::err_omp_lastprivate_incomplete_type)) continue; Type = Type.getNonReferenceType(); @@ -18902,7 +18974,7 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( // OpenMP 3.1 [2.9.3.5, lastprivate clause, Restrictions] // A variable that appears in a lastprivate clause must not have a // const-qualified type unless it is of class type with a mutable member. - if (rejectConstNotMutableType(*this, D, Type, OMPC_lastprivate, ELoc)) + if (rejectConstNotMutableType(SemaRef, D, Type, OMPC_lastprivate, ELoc)) continue; // OpenMP 5.0 [2.19.4.5 lastprivate Clause, Restrictions] @@ -18910,7 +18982,7 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( // modifier must be a scalar variable. if (LPKind == OMPC_LASTPRIVATE_conditional && !Type->isScalarType()) { Diag(ELoc, diag::err_omp_lastprivate_conditional_non_scalar); - bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) == VarDecl::DeclarationOnly; Diag(D->getLocation(), IsDecl ? diag::note_previous_decl : diag::note_defined_here) @@ -18935,7 +19007,7 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_lastprivate); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -18954,7 +19026,7 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( Diag(ELoc, diag::err_omp_required_access) << getOpenMPClauseName(OMPC_lastprivate) << getOpenMPClauseName(OMPC_shared); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } } @@ -18967,53 +19039,53 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( // A variable of class type (or array thereof) that appears in a // lastprivate clause requires an accessible, unambiguous copy assignment // operator for the class type. 
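
For readers less familiar with the clause being checked here: lastprivate copies the value from the sequentially last iteration back into the original list item, which is why a class-type item needs an accessible, unambiguous copy assignment operator. A small illustrative use, not taken from this patch (Accum is a made-up type); compile with -fopenmp:

#include <cstdio>

struct Accum {
  int v = 0;
  Accum &operator=(const Accum &) = default; // what the restriction demands
};

int main() {
  Accum a;
#pragma omp parallel for lastprivate(a)
  for (int i = 0; i < 8; ++i)
    a.v = i; // each thread writes its private copy
  std::printf("%d\n", a.v); // prints 7: the value from the last iteration
}
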
- Type = Context.getBaseElementType(Type).getNonReferenceType(); - VarDecl *SrcVD = buildVarDecl(*this, ERange.getBegin(), + Type = getASTContext().getBaseElementType(Type).getNonReferenceType(); + VarDecl *SrcVD = buildVarDecl(SemaRef, ERange.getBegin(), Type.getUnqualifiedType(), ".lastprivate.src", D->hasAttrs() ? &D->getAttrs() : nullptr); DeclRefExpr *PseudoSrcExpr = - buildDeclRefExpr(*this, SrcVD, Type.getUnqualifiedType(), ELoc); + buildDeclRefExpr(SemaRef, SrcVD, Type.getUnqualifiedType(), ELoc); VarDecl *DstVD = - buildVarDecl(*this, ERange.getBegin(), Type, ".lastprivate.dst", + buildVarDecl(SemaRef, ERange.getBegin(), Type, ".lastprivate.dst", D->hasAttrs() ? &D->getAttrs() : nullptr); - DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(*this, DstVD, Type, ELoc); + DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(SemaRef, DstVD, Type, ELoc); // For arrays generate assignment operation for single element and replace // it by the original array element in CodeGen. - ExprResult AssignmentOp = BuildBinOp(/*S=*/nullptr, ELoc, BO_Assign, - PseudoDstExpr, PseudoSrcExpr); + ExprResult AssignmentOp = SemaRef.BuildBinOp(/*S=*/nullptr, ELoc, BO_Assign, + PseudoDstExpr, PseudoSrcExpr); if (AssignmentOp.isInvalid()) continue; - AssignmentOp = - ActOnFinishFullExpr(AssignmentOp.get(), ELoc, /*DiscardedValue*/ false); + AssignmentOp = SemaRef.ActOnFinishFullExpr(AssignmentOp.get(), ELoc, + /*DiscardedValue*/ false); if (AssignmentOp.isInvalid()) continue; DeclRefExpr *Ref = nullptr; - if (!VD && !CurContext->isDependentContext()) { + if (!VD && !SemaRef.CurContext->isDependentContext()) { if (TopDVar.CKind == OMPC_firstprivate) { Ref = TopDVar.PrivateCopy; } else { - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false); + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/false); if (!isOpenMPCapturedDecl(D)) ExprCaptures.push_back(Ref->getDecl()); } if ((TopDVar.CKind == OMPC_firstprivate && !TopDVar.PrivateCopy) || (!isOpenMPCapturedDecl(D) && Ref->getDecl()->hasAttr())) { - ExprResult RefRes = DefaultLvalueConversion(Ref); + ExprResult RefRes = SemaRef.DefaultLvalueConversion(Ref); if (!RefRes.isUsable()) continue; ExprResult PostUpdateRes = - BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign, SimpleRefExpr, - RefRes.get()); + SemaRef.BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign, + SimpleRefExpr, RefRes.get()); if (!PostUpdateRes.isUsable()) continue; ExprPostUpdates.push_back( - IgnoredValueConversions(PostUpdateRes.get()).get()); + SemaRef.IgnoredValueConversions(PostUpdateRes.get()).get()); } } DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_lastprivate, Ref); - Vars.push_back((VD || CurContext->isDependentContext()) + Vars.push_back((VD || SemaRef.CurContext->isDependentContext()) ? 
RefExpr->IgnoreParens() : Ref); SrcExprs.push_back(PseudoSrcExpr); @@ -19024,24 +19096,24 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( if (Vars.empty()) return nullptr; - return OMPLastprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc, - Vars, SrcExprs, DstExprs, AssignmentOps, - LPKind, LPKindLoc, ColonLoc, - buildPreInits(Context, ExprCaptures), - buildPostUpdate(*this, ExprPostUpdates)); + return OMPLastprivateClause::Create( + getASTContext(), StartLoc, LParenLoc, EndLoc, Vars, SrcExprs, DstExprs, + AssignmentOps, LPKind, LPKindLoc, ColonLoc, + buildPreInits(getASTContext(), ExprCaptures), + buildPostUpdate(SemaRef, ExprPostUpdates)); } -OMPClause *Sema::ActOnOpenMPSharedClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPSharedClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector Vars; for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP lastprivate clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -19063,15 +19135,16 @@ OMPClause *Sema::ActOnOpenMPSharedClause(ArrayRef VarList, DVar.RefExpr) { Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_shared); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } DeclRefExpr *Ref = nullptr; - if (!VD && isOpenMPCapturedDecl(D) && !CurContext->isDependentContext()) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); + if (!VD && isOpenMPCapturedDecl(D) && + !SemaRef.CurContext->isDependentContext()) + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_shared, Ref); - Vars.push_back((VD || !Ref || CurContext->isDependentContext()) + Vars.push_back((VD || !Ref || SemaRef.CurContext->isDependentContext()) ? 
RefExpr->IgnoreParens() : Ref); } @@ -19079,7 +19152,8 @@ OMPClause *Sema::ActOnOpenMPSharedClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPSharedClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars); + return OMPSharedClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc, + Vars); } namespace { @@ -20200,7 +20274,7 @@ static bool actOnOMPReductionKindClause( } else { VarsExpr = Ref = buildCapture(S, D, SimpleRefExpr, /*WithInit=*/false); } - if (!S.isOpenMPCapturedDecl(D)) { + if (!S.OpenMP().isOpenMPCapturedDecl(D)) { RD.ExprCaptures.emplace_back(Ref->getDecl()); if (Ref->getDecl()->hasAttr()) { ExprResult RefRes = S.DefaultLvalueConversion(Ref); @@ -20250,7 +20324,7 @@ static bool actOnOMPReductionKindClause( return RD.Vars.empty(); } -OMPClause *Sema::ActOnOpenMPReductionClause( +OMPClause *SemaOpenMP::ActOnOpenMPReductionClause( ArrayRef VarList, OpenMPReductionClauseModifier Modifier, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc, @@ -20279,77 +20353,80 @@ OMPClause *Sema::ActOnOpenMPReductionClause( } ReductionData RD(VarList.size(), Modifier); - if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_reduction, VarList, + if (actOnOMPReductionKindClause(SemaRef, DSAStack, OMPC_reduction, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions, RD)) return nullptr; return OMPReductionClause::Create( - Context, StartLoc, LParenLoc, ModifierLoc, ColonLoc, EndLoc, Modifier, - RD.Vars, ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId, + getASTContext(), StartLoc, LParenLoc, ModifierLoc, ColonLoc, EndLoc, + Modifier, RD.Vars, + ReductionIdScopeSpec.getWithLocInContext(getASTContext()), ReductionId, RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, RD.InscanCopyOps, RD.InscanCopyArrayTemps, RD.InscanCopyArrayElems, - buildPreInits(Context, RD.ExprCaptures), - buildPostUpdate(*this, RD.ExprPostUpdates)); + buildPreInits(getASTContext(), RD.ExprCaptures), + buildPostUpdate(SemaRef, RD.ExprPostUpdates)); } -OMPClause *Sema::ActOnOpenMPTaskReductionClause( +OMPClause *SemaOpenMP::ActOnOpenMPTaskReductionClause( ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef UnresolvedReductions) { ReductionData RD(VarList.size()); - if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_task_reduction, VarList, - StartLoc, LParenLoc, ColonLoc, EndLoc, - ReductionIdScopeSpec, ReductionId, + if (actOnOMPReductionKindClause(SemaRef, DSAStack, OMPC_task_reduction, + VarList, StartLoc, LParenLoc, ColonLoc, + EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions, RD)) return nullptr; return OMPTaskReductionClause::Create( - Context, StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars, - ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId, + getASTContext(), StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars, + ReductionIdScopeSpec.getWithLocInContext(getASTContext()), ReductionId, RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, - buildPreInits(Context, RD.ExprCaptures), - buildPostUpdate(*this, RD.ExprPostUpdates)); + buildPreInits(getASTContext(), RD.ExprCaptures), + buildPostUpdate(SemaRef, RD.ExprPostUpdates)); } -OMPClause *Sema::ActOnOpenMPInReductionClause( +OMPClause *SemaOpenMP::ActOnOpenMPInReductionClause( ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, 
SourceLocation ColonLoc, SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef UnresolvedReductions) { ReductionData RD(VarList.size()); - if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_in_reduction, VarList, + if (actOnOMPReductionKindClause(SemaRef, DSAStack, OMPC_in_reduction, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions, RD)) return nullptr; return OMPInReductionClause::Create( - Context, StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars, - ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId, + getASTContext(), StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars, + ReductionIdScopeSpec.getWithLocInContext(getASTContext()), ReductionId, RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, RD.TaskgroupDescriptors, - buildPreInits(Context, RD.ExprCaptures), - buildPostUpdate(*this, RD.ExprPostUpdates)); + buildPreInits(getASTContext(), RD.ExprCaptures), + buildPostUpdate(SemaRef, RD.ExprPostUpdates)); } -bool Sema::CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind, - SourceLocation LinLoc) { - if ((!LangOpts.CPlusPlus && LinKind != OMPC_LINEAR_val) || +bool SemaOpenMP::CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind, + SourceLocation LinLoc) { + if ((!getLangOpts().CPlusPlus && LinKind != OMPC_LINEAR_val) || LinKind == OMPC_LINEAR_unknown || LinKind == OMPC_LINEAR_step) { - Diag(LinLoc, diag::err_omp_wrong_linear_modifier) << LangOpts.CPlusPlus; + Diag(LinLoc, diag::err_omp_wrong_linear_modifier) + << getLangOpts().CPlusPlus; return true; } return false; } -bool Sema::CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc, - OpenMPLinearClauseKind LinKind, QualType Type, - bool IsDeclareSimd) { +bool SemaOpenMP::CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc, + OpenMPLinearClauseKind LinKind, + QualType Type, bool IsDeclareSimd) { const auto *VD = dyn_cast_or_null(D); // A variable must not have an incomplete type or a reference type. - if (RequireCompleteType(ELoc, Type, diag::err_omp_linear_incomplete_type)) + if (SemaRef.RequireCompleteType(ELoc, Type, + diag::err_omp_linear_incomplete_type)) return true; if ((LinKind == OMPC_LINEAR_uval || LinKind == OMPC_LINEAR_ref) && !Type->isReferenceType()) { @@ -20365,17 +20442,17 @@ bool Sema::CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc, // not apply to the firstprivate clause, nor to the linear clause on // declarative directives (like declare simd). if (!IsDeclareSimd && - rejectConstNotMutableType(*this, D, Type, OMPC_linear, ELoc)) + rejectConstNotMutableType(SemaRef, D, Type, OMPC_linear, ELoc)) return true; // A list item must be of integral or pointer type. Type = Type.getUnqualifiedType().getCanonicalType(); const auto *Ty = Type.getTypePtrOrNull(); if (!Ty || (LinKind != OMPC_LINEAR_ref && !Ty->isDependentType() && - !Ty->isIntegralType(Context) && !Ty->isPointerType())) { + !Ty->isIntegralType(getASTContext()) && !Ty->isPointerType())) { Diag(ELoc, diag::err_omp_linear_expected_int_or_ptr) << Type; if (D) { - bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) == VarDecl::DeclarationOnly; Diag(D->getLocation(), IsDecl ? 
diag::note_previous_decl : diag::note_defined_here) @@ -20386,7 +20463,7 @@ bool Sema::CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc, return false; } -OMPClause *Sema::ActOnOpenMPLinearClause( +OMPClause *SemaOpenMP::ActOnOpenMPLinearClause( ArrayRef VarList, Expr *Step, SourceLocation StartLoc, SourceLocation LParenLoc, OpenMPLinearClauseKind LinKind, SourceLocation LinLoc, SourceLocation ColonLoc, @@ -20409,7 +20486,7 @@ OMPClause *Sema::ActOnOpenMPLinearClause( SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -20431,7 +20508,7 @@ OMPClause *Sema::ActOnOpenMPLinearClause( if (DVar.RefExpr) { Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_linear); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -20441,29 +20518,29 @@ OMPClause *Sema::ActOnOpenMPLinearClause( // Build private copy of original var. VarDecl *Private = - buildVarDecl(*this, ELoc, Type, D->getName(), + buildVarDecl(SemaRef, ELoc, Type, D->getName(), D->hasAttrs() ? &D->getAttrs() : nullptr, VD ? cast(SimpleRefExpr) : nullptr); - DeclRefExpr *PrivateRef = buildDeclRefExpr(*this, Private, Type, ELoc); + DeclRefExpr *PrivateRef = buildDeclRefExpr(SemaRef, Private, Type, ELoc); // Build var to save initial value. - VarDecl *Init = buildVarDecl(*this, ELoc, Type, ".linear.start"); + VarDecl *Init = buildVarDecl(SemaRef, ELoc, Type, ".linear.start"); Expr *InitExpr; DeclRefExpr *Ref = nullptr; - if (!VD && !CurContext->isDependentContext()) { - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false); + if (!VD && !SemaRef.CurContext->isDependentContext()) { + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/false); if (!isOpenMPCapturedDecl(D)) { ExprCaptures.push_back(Ref->getDecl()); if (Ref->getDecl()->hasAttr()) { - ExprResult RefRes = DefaultLvalueConversion(Ref); + ExprResult RefRes = SemaRef.DefaultLvalueConversion(Ref); if (!RefRes.isUsable()) continue; ExprResult PostUpdateRes = - BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign, - SimpleRefExpr, RefRes.get()); + SemaRef.BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign, + SimpleRefExpr, RefRes.get()); if (!PostUpdateRes.isUsable()) continue; ExprPostUpdates.push_back( - IgnoredValueConversions(PostUpdateRes.get()).get()); + SemaRef.IgnoredValueConversions(PostUpdateRes.get()).get()); } } } @@ -20471,12 +20548,13 @@ OMPClause *Sema::ActOnOpenMPLinearClause( InitExpr = VD ? VD->getInit() : SimpleRefExpr; else InitExpr = VD ? SimpleRefExpr : Ref; - AddInitializerToDecl(Init, DefaultLvalueConversion(InitExpr).get(), - /*DirectInit=*/false); - DeclRefExpr *InitRef = buildDeclRefExpr(*this, Init, Type, ELoc); + SemaRef.AddInitializerToDecl( + Init, SemaRef.DefaultLvalueConversion(InitExpr).get(), + /*DirectInit=*/false); + DeclRefExpr *InitRef = buildDeclRefExpr(SemaRef, Init, Type, ELoc); DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_linear, Ref); - Vars.push_back((VD || CurContext->isDependentContext()) + Vars.push_back((VD || SemaRef.CurContext->isDependentContext()) ? RefExpr->IgnoreParens() : Ref); Privates.push_back(PrivateRef); @@ -20499,17 +20577,18 @@ OMPClause *Sema::ActOnOpenMPLinearClause( // Build var to save the step value. 
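
Just below, the existing warning for a zero linear step is preserved; it hinges on the fact that an integer that is neither negative nor strictly positive must be zero. A tiny standalone restatement of that predicate, with a plain long standing in for the llvm::APSInt the real code folds the step to:

#include <cassert>

// Mirrors the check feeding warn_omp_linear_step_zero: a step that is
// neither negative nor strictly positive can only be zero.
bool isZeroLinearStep(long Step) {
  bool Negative = Step < 0;
  bool StrictlyPositive = Step > 0;
  return !Negative && !StrictlyPositive;
}

int main() {
  assert(isZeroLinearStep(0));
  assert(!isZeroLinearStep(1));
  assert(!isZeroLinearStep(-3));
}
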
VarDecl *SaveVar = - buildVarDecl(*this, StepLoc, StepExpr->getType(), ".linear.step"); + buildVarDecl(SemaRef, StepLoc, StepExpr->getType(), ".linear.step"); ExprResult SaveRef = - buildDeclRefExpr(*this, SaveVar, StepExpr->getType(), StepLoc); - ExprResult CalcStep = - BuildBinOp(CurScope, StepLoc, BO_Assign, SaveRef.get(), StepExpr); - CalcStep = ActOnFinishFullExpr(CalcStep.get(), /*DiscardedValue*/ false); + buildDeclRefExpr(SemaRef, SaveVar, StepExpr->getType(), StepLoc); + ExprResult CalcStep = SemaRef.BuildBinOp( + SemaRef.getCurScope(), StepLoc, BO_Assign, SaveRef.get(), StepExpr); + CalcStep = + SemaRef.ActOnFinishFullExpr(CalcStep.get(), /*DiscardedValue*/ false); // Warn about zero linear step (it would be probably better specified as // making corresponding variables 'const'). if (std::optional Result = - StepExpr->getIntegerConstantExpr(Context)) { + StepExpr->getIntegerConstantExpr(getASTContext())) { if (!Result->isNegative() && !Result->isStrictlyPositive()) Diag(StepLoc, diag::warn_omp_linear_step_zero) << Vars[0] << (Vars.size() > 1); @@ -20520,11 +20599,11 @@ OMPClause *Sema::ActOnOpenMPLinearClause( } } - return OMPLinearClause::Create(Context, StartLoc, LParenLoc, LinKind, LinLoc, - ColonLoc, StepModifierLoc, EndLoc, Vars, - Privates, Inits, StepExpr, CalcStepExpr, - buildPreInits(Context, ExprCaptures), - buildPostUpdate(*this, ExprPostUpdates)); + return OMPLinearClause::Create(getASTContext(), StartLoc, LParenLoc, LinKind, + LinLoc, ColonLoc, StepModifierLoc, EndLoc, + Vars, Privates, Inits, StepExpr, CalcStepExpr, + buildPreInits(getASTContext(), ExprCaptures), + buildPostUpdate(SemaRef, ExprPostUpdates)); } static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV, @@ -20630,7 +20709,7 @@ static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV, return HasErrors; } -OMPClause *Sema::ActOnOpenMPAlignedClause( +OMPClause *SemaOpenMP::ActOnOpenMPAlignedClause( ArrayRef VarList, Expr *Alignment, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc) { SmallVector Vars; @@ -20639,7 +20718,7 @@ OMPClause *Sema::ActOnOpenMPAlignedClause( SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -20659,7 +20738,7 @@ OMPClause *Sema::ActOnOpenMPAlignedClause( if (!Ty || (!Ty->isArrayType() && !Ty->isPointerType())) { Diag(ELoc, diag::err_omp_aligned_expected_array_or_ptr) << QType << getLangOpts().CPlusPlus << ERange; - bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) == VarDecl::DeclarationOnly; Diag(D->getLocation(), IsDecl ? diag::note_previous_decl : diag::note_defined_here) @@ -20679,9 +20758,10 @@ OMPClause *Sema::ActOnOpenMPAlignedClause( DeclRefExpr *Ref = nullptr; if (!VD && isOpenMPCapturedDecl(D)) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); - Vars.push_back(DefaultFunctionArrayConversion( - (VD || !Ref) ? RefExpr->IgnoreParens() : Ref) + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); + Vars.push_back(SemaRef + .DefaultFunctionArrayConversion( + (VD || !Ref) ? 
RefExpr->IgnoreParens() : Ref) .get()); } @@ -20700,14 +20780,14 @@ OMPClause *Sema::ActOnOpenMPAlignedClause( if (Vars.empty()) return nullptr; - return OMPAlignedClause::Create(Context, StartLoc, LParenLoc, ColonLoc, - EndLoc, Vars, Alignment); + return OMPAlignedClause::Create(getASTContext(), StartLoc, LParenLoc, + ColonLoc, EndLoc, Vars, Alignment); } -OMPClause *Sema::ActOnOpenMPCopyinClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPCopyinClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector Vars; SmallVector SrcExprs; SmallVector DstExprs; @@ -20761,26 +20841,28 @@ OMPClause *Sema::ActOnOpenMPCopyinClause(ArrayRef VarList, // A variable of class type (or array thereof) that appears in a // copyin clause requires an accessible, unambiguous copy assignment // operator for the class type. - QualType ElemType = Context.getBaseElementType(Type).getNonReferenceType(); + QualType ElemType = + getASTContext().getBaseElementType(Type).getNonReferenceType(); VarDecl *SrcVD = - buildVarDecl(*this, DE->getBeginLoc(), ElemType.getUnqualifiedType(), + buildVarDecl(SemaRef, DE->getBeginLoc(), ElemType.getUnqualifiedType(), ".copyin.src", VD->hasAttrs() ? &VD->getAttrs() : nullptr); DeclRefExpr *PseudoSrcExpr = buildDeclRefExpr( - *this, SrcVD, ElemType.getUnqualifiedType(), DE->getExprLoc()); + SemaRef, SrcVD, ElemType.getUnqualifiedType(), DE->getExprLoc()); VarDecl *DstVD = - buildVarDecl(*this, DE->getBeginLoc(), ElemType, ".copyin.dst", + buildVarDecl(SemaRef, DE->getBeginLoc(), ElemType, ".copyin.dst", VD->hasAttrs() ? &VD->getAttrs() : nullptr); DeclRefExpr *PseudoDstExpr = - buildDeclRefExpr(*this, DstVD, ElemType, DE->getExprLoc()); + buildDeclRefExpr(SemaRef, DstVD, ElemType, DE->getExprLoc()); // For arrays generate assignment operation for single element and replace // it by the original array element in CodeGen. 
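
An illustrative use of the clause handled in ActOnOpenMPCopyinClause above (not from this patch): copyin broadcasts the primary thread's threadprivate value into every thread's copy on entry to the parallel region. Compile with -fopenmp:

#include <cstdio>
#include <omp.h>

int counter = 0;
#pragma omp threadprivate(counter)

int main() {
  counter = 7; // primary thread's copy
#pragma omp parallel copyin(counter)
  std::printf("thread %d starts with %d\n", omp_get_thread_num(), counter);
}
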
ExprResult AssignmentOp = - BuildBinOp(/*S=*/nullptr, DE->getExprLoc(), BO_Assign, PseudoDstExpr, - PseudoSrcExpr); + SemaRef.BuildBinOp(/*S=*/nullptr, DE->getExprLoc(), BO_Assign, + PseudoDstExpr, PseudoSrcExpr); if (AssignmentOp.isInvalid()) continue; - AssignmentOp = ActOnFinishFullExpr(AssignmentOp.get(), DE->getExprLoc(), - /*DiscardedValue*/ false); + AssignmentOp = + SemaRef.ActOnFinishFullExpr(AssignmentOp.get(), DE->getExprLoc(), + /*DiscardedValue*/ false); if (AssignmentOp.isInvalid()) continue; @@ -20794,14 +20876,14 @@ OMPClause *Sema::ActOnOpenMPCopyinClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPCopyinClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars, - SrcExprs, DstExprs, AssignmentOps); + return OMPCopyinClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc, + Vars, SrcExprs, DstExprs, AssignmentOps); } -OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPCopyprivateClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector Vars; SmallVector SrcExprs; SmallVector DstExprs; @@ -20811,7 +20893,7 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -20837,7 +20919,7 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_copyprivate); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -20850,7 +20932,7 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, Diag(ELoc, diag::err_omp_required_access) << getOpenMPClauseName(OMPC_copyprivate) << "threadprivate or private in the enclosing context"; - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } } @@ -20861,7 +20943,7 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, Diag(ELoc, diag::err_omp_variably_modified_type_not_supported) << getOpenMPClauseName(OMPC_copyprivate) << Type << getOpenMPDirectiveName(DSAStack->getCurrentDirective()); - bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) == VarDecl::DeclarationOnly; Diag(D->getLocation(), IsDecl ? diag::note_previous_decl : diag::note_defined_here) @@ -20873,22 +20955,23 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, // A variable of class type (or array thereof) that appears in a // copyin clause requires an accessible, unambiguous copy assignment // operator for the class type. - Type = Context.getBaseElementType(Type.getNonReferenceType()) + Type = getASTContext() + .getBaseElementType(Type.getNonReferenceType()) .getUnqualifiedType(); VarDecl *SrcVD = - buildVarDecl(*this, RefExpr->getBeginLoc(), Type, ".copyprivate.src", + buildVarDecl(SemaRef, RefExpr->getBeginLoc(), Type, ".copyprivate.src", D->hasAttrs() ? 
&D->getAttrs() : nullptr); - DeclRefExpr *PseudoSrcExpr = buildDeclRefExpr(*this, SrcVD, Type, ELoc); + DeclRefExpr *PseudoSrcExpr = buildDeclRefExpr(SemaRef, SrcVD, Type, ELoc); VarDecl *DstVD = - buildVarDecl(*this, RefExpr->getBeginLoc(), Type, ".copyprivate.dst", + buildVarDecl(SemaRef, RefExpr->getBeginLoc(), Type, ".copyprivate.dst", D->hasAttrs() ? &D->getAttrs() : nullptr); - DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(*this, DstVD, Type, ELoc); - ExprResult AssignmentOp = BuildBinOp( + DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(SemaRef, DstVD, Type, ELoc); + ExprResult AssignmentOp = SemaRef.BuildBinOp( DSAStack->getCurScope(), ELoc, BO_Assign, PseudoDstExpr, PseudoSrcExpr); if (AssignmentOp.isInvalid()) continue; - AssignmentOp = - ActOnFinishFullExpr(AssignmentOp.get(), ELoc, /*DiscardedValue*/ false); + AssignmentOp = SemaRef.ActOnFinishFullExpr(AssignmentOp.get(), ELoc, + /*DiscardedValue*/ false); if (AssignmentOp.isInvalid()) continue; @@ -20897,7 +20980,7 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, assert(VD || isOpenMPCapturedDecl(D)); Vars.push_back( VD ? RefExpr->IgnoreParens() - : buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false)); + : buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/false)); SrcExprs.push_back(PseudoSrcExpr); DstExprs.push_back(PseudoDstExpr); AssignmentOps.push_back(AssignmentOp.get()); @@ -20906,18 +20989,20 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPCopyprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc, - Vars, SrcExprs, DstExprs, AssignmentOps); + return OMPCopyprivateClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, Vars, SrcExprs, DstExprs, + AssignmentOps); } -OMPClause *Sema::ActOnOpenMPFlushClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPFlushClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (VarList.empty()) return nullptr; - return OMPFlushClause::Create(Context, StartLoc, LParenLoc, EndLoc, VarList); + return OMPFlushClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc, + VarList); } /// Tries to find omp_depend_t. type. @@ -20937,22 +21022,23 @@ static bool findOMPDependT(Sema &S, SourceLocation Loc, DSAStackTy *Stack, return true; } -OMPClause *Sema::ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPDepobjClause(Expr *Depobj, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (!Depobj) return nullptr; - bool OMPDependTFound = findOMPDependT(*this, StartLoc, DSAStack); + bool OMPDependTFound = findOMPDependT(SemaRef, StartLoc, DSAStack); // OpenMP 5.0, 2.17.10.1 depobj Construct // depobj is an lvalue expression of type omp_depend_t. 
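
As a sketch of what the type check below enforces (illustrative, not from the patch): the argument of depobj must be a modifiable lvalue of type omp_depend_t. With an OpenMP 5.0 capable toolchain and -fopenmp:

#include <omp.h>

int main() {
  int x = 0;
  omp_depend_t dep; // the lvalue whose type the check below validates
#pragma omp depobj(dep) depend(inout : x)
#pragma omp task depend(depobj : dep)
  x += 1;
#pragma omp taskwait
#pragma omp depobj(dep) destroy
  return x;
}
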
if (!Depobj->isTypeDependent() && !Depobj->isValueDependent() && !Depobj->isInstantiationDependent() && !Depobj->containsUnexpandedParameterPack() && - (OMPDependTFound && - !Context.typesAreCompatible(DSAStack->getOMPDependT(), Depobj->getType(), - /*CompareUnqualified=*/true))) { + (OMPDependTFound && !getASTContext().typesAreCompatible( + DSAStack->getOMPDependT(), Depobj->getType(), + /*CompareUnqualified=*/true))) { Diag(Depobj->getExprLoc(), diag::err_omp_expected_omp_depend_t_lvalue) << 0 << Depobj->getType() << Depobj->getSourceRange(); } @@ -20962,7 +21048,8 @@ OMPClause *Sema::ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, << 1 << Depobj->getSourceRange(); } - return OMPDepobjClause::Create(Context, StartLoc, LParenLoc, EndLoc, Depobj); + return OMPDepobjClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc, + Depobj); } namespace { @@ -21062,8 +21149,9 @@ ProcessOpenMPDoacrossClauseCommon(Sema &SemaRef, bool IsSource, continue; } if (RHS) { - ExprResult RHSRes = SemaRef.VerifyPositiveIntegerConstantInClause( - RHS, OMPC_depend, /*StrictlyPositive=*/false); + ExprResult RHSRes = + SemaRef.OpenMP().VerifyPositiveIntegerConstantInClause( + RHS, OMPC_depend, /*StrictlyPositive=*/false); if (RHSRes.isInvalid()) continue; } @@ -21094,11 +21182,10 @@ ProcessOpenMPDoacrossClauseCommon(Sema &SemaRef, bool IsSource, return {Vars, OpsOffs, TotalDepCount}; } -OMPClause * -Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, - Expr *DepModifier, ArrayRef VarList, - SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPDependClause( + const OMPDependClause::DependDataTy &Data, Expr *DepModifier, + ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation EndLoc) { OpenMPDependClauseKind DepKind = Data.DepKind; SourceLocation DepLoc = Data.DepLoc; if (DSAStack->getCurrentDirective() == OMPD_ordered && @@ -21116,17 +21203,18 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, DSAStack->getCurrentDirective() == OMPD_depobj) && (DepKind == OMPC_DEPEND_unknown || DepKind == OMPC_DEPEND_source || DepKind == OMPC_DEPEND_sink || - ((LangOpts.OpenMP < 50 || + ((getLangOpts().OpenMP < 50 || DSAStack->getCurrentDirective() == OMPD_depobj) && DepKind == OMPC_DEPEND_depobj))) { SmallVector Except = {OMPC_DEPEND_source, OMPC_DEPEND_sink, OMPC_DEPEND_outallmemory, OMPC_DEPEND_inoutallmemory}; - if (LangOpts.OpenMP < 50 || DSAStack->getCurrentDirective() == OMPD_depobj) + if (getLangOpts().OpenMP < 50 || + DSAStack->getCurrentDirective() == OMPD_depobj) Except.push_back(OMPC_DEPEND_depobj); - if (LangOpts.OpenMP < 51) + if (getLangOpts().OpenMP < 51) Except.push_back(OMPC_DEPEND_inoutset); - std::string Expected = (LangOpts.OpenMP >= 50 && !DepModifier) + std::string Expected = (getLangOpts().OpenMP >= 50 && !DepModifier) ? 
"depend modifier(iterator) or " : ""; Diag(DepLoc, diag::err_omp_unexpected_clause_value) @@ -21152,7 +21240,7 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, if (DepKind == OMPC_DEPEND_sink || DepKind == OMPC_DEPEND_source) { DoacrossDataInfoTy VarOffset = ProcessOpenMPDoacrossClauseCommon( - *this, DepKind == OMPC_DEPEND_source, VarList, DSAStack, EndLoc); + SemaRef, DepKind == OMPC_DEPEND_source, VarList, DSAStack, EndLoc); Vars = VarOffset.Vars; OpsOffs = VarOffset.OpsOffs; TotalDepCount = VarOffset.TotalDepCount; @@ -21168,9 +21256,9 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, SourceLocation ELoc = RefExpr->getExprLoc(); Expr *SimpleExpr = RefExpr->IgnoreParenCasts(); if (DepKind != OMPC_DEPEND_sink && DepKind != OMPC_DEPEND_source) { - bool OMPDependTFound = LangOpts.OpenMP >= 50; + bool OMPDependTFound = getLangOpts().OpenMP >= 50; if (OMPDependTFound) - OMPDependTFound = findOMPDependT(*this, StartLoc, DSAStack, + OMPDependTFound = findOMPDependT(SemaRef, StartLoc, DSAStack, DepKind == OMPC_DEPEND_depobj); if (DepKind == OMPC_DEPEND_depobj) { // OpenMP 5.0, 2.17.11 depend Clause, Restrictions, C/C++ @@ -21180,8 +21268,8 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, !RefExpr->isInstantiationDependent() && !RefExpr->containsUnexpandedParameterPack() && (OMPDependTFound && - !Context.hasSameUnqualifiedType(DSAStack->getOMPDependT(), - RefExpr->getType()))) { + !getASTContext().hasSameUnqualifiedType( + DSAStack->getOMPDependT(), RefExpr->getType()))) { Diag(ELoc, diag::err_omp_expected_omp_depend_t_lvalue) << 0 << RefExpr->getType() << RefExpr->getSourceRange(); continue; @@ -21212,7 +21300,7 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, const Expr *Length = OASE->getLength(); Expr::EvalResult Result; if (Length && !Length->isValueDependent() && - Length->EvaluateAsInt(Result, Context) && + Length->EvaluateAsInt(Result, getASTContext()) && Result.Val.getInt().isZero()) { Diag(ELoc, diag::err_omp_depend_zero_length_array_section_not_allowed) @@ -21232,8 +21320,9 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, (OMPDependTFound && DSAStack->getOMPDependT().getTypePtr() == ExprTy.getTypePtr()))) { Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item) - << (LangOpts.OpenMP >= 50 ? 1 : 0) - << (LangOpts.OpenMP >= 50 ? 1 : 0) << RefExpr->getSourceRange(); + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << RefExpr->getSourceRange(); continue; } @@ -21245,22 +21334,24 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, ->isPointerType() && !ASE->getBase()->getType().getNonReferenceType()->isArrayType()) { Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item) - << (LangOpts.OpenMP >= 50 ? 1 : 0) - << (LangOpts.OpenMP >= 50 ? 1 : 0) << RefExpr->getSourceRange(); + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << RefExpr->getSourceRange(); continue; } ExprResult Res; { - Sema::TentativeAnalysisScope Trap(*this); - Res = CreateBuiltinUnaryOp(ELoc, UO_AddrOf, - RefExpr->IgnoreParenImpCasts()); + Sema::TentativeAnalysisScope Trap(SemaRef); + Res = SemaRef.CreateBuiltinUnaryOp(ELoc, UO_AddrOf, + RefExpr->IgnoreParenImpCasts()); } if (!Res.isUsable() && !isa(SimpleExpr) && !isa(SimpleExpr)) { Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item) - << (LangOpts.OpenMP >= 50 ? 1 : 0) - << (LangOpts.OpenMP >= 50 ? 
1 : 0) << RefExpr->getSourceRange(); + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << RefExpr->getSourceRange(); continue; } } @@ -21275,7 +21366,7 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, return nullptr; auto *C = OMPDependClause::Create( - Context, StartLoc, LParenLoc, EndLoc, + getASTContext(), StartLoc, LParenLoc, EndLoc, {DepKind, DepLoc, Data.ColonLoc, Data.OmpAllMemoryLoc}, DepModifier, Vars, TotalDepCount.getZExtValue()); if ((DepKind == OMPC_DEPEND_sink || DepKind == OMPC_DEPEND_source) && @@ -21284,12 +21375,11 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, return C; } -OMPClause *Sema::ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, - Expr *Device, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ModifierLoc, - SourceLocation EndLoc) { - assert((ModifierLoc.isInvalid() || LangOpts.OpenMP >= 50) && +OMPClause *SemaOpenMP::ActOnOpenMPDeviceClause( + OpenMPDeviceClauseModifier Modifier, Expr *Device, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation ModifierLoc, + SourceLocation EndLoc) { + assert((ModifierLoc.isInvalid() || getLangOpts().OpenMP >= 50) && "Unexpected device modifier in OpenMP < 50."); bool ErrorFound = false; @@ -21306,7 +21396,7 @@ OMPClause *Sema::ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, // OpenMP [2.9.1, Restrictions] // The device expression must evaluate to a non-negative integer value. - ErrorFound = !isNonNegativeIntegerValue(ValExpr, *this, OMPC_device, + ErrorFound = !isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_device, /*StrictlyPositive=*/false) || ErrorFound; if (ErrorFound) @@ -21317,7 +21407,7 @@ OMPClause *Sema::ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, // the reverse_offload clause must be specified. 
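
For context on the check that follows: an ancestor device modifier only makes sense under reverse offload, so the matching requires directive must be visible. A rough sketch of a use site (illustrative only; assumes an OpenMP 5.0 offloading toolchain):

#pragma omp requires reverse_offload

void f(int *p) {
#pragma omp target map(tofrom : p[0 : 1])
  {
    // Without the requires directive above, Sema rejects this modifier.
#pragma omp target device(ancestor : 1)
    p[0] += 1; // runs back on the host
  }
}
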
if (Modifier == OMPC_DEVICE_ancestor) { if (!DSAStack->hasRequiresDeclWithClause()) { - targetDiag( + SemaRef.targetDiag( StartLoc, diag::err_omp_device_ancestor_without_requires_reverse_offload); ErrorFound = true; @@ -21326,15 +21416,16 @@ OMPClause *Sema::ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); OpenMPDirectiveKind CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_device, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + getOpenMPCaptureRegionForClause(DKind, OMPC_device, getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } - return new (Context) + return new (getASTContext()) OMPDeviceClause(Modifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, ModifierLoc, EndLoc); } @@ -22527,7 +22618,7 @@ static void checkMappableExpressionList( } } -OMPClause *Sema::ActOnOpenMPMapClause( +OMPClause *SemaOpenMP::ActOnOpenMPMapClause( Expr *IteratorModifier, ArrayRef MapTypeModifiers, ArrayRef MapTypeModifiersLoc, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, @@ -22562,7 +22653,7 @@ OMPClause *Sema::ActOnOpenMPMapClause( } MappableVarListInfo MVLI(VarList); - checkMappableExpressionList(*this, DSAStack, OMPC_map, MVLI, Locs.StartLoc, + checkMappableExpressionList(SemaRef, DSAStack, OMPC_map, MVLI, Locs.StartLoc, MapperIdScopeSpec, MapperId, UnresolvedMappers, MapType, Modifiers, IsMapTypeImplicit, NoDiagnose); @@ -22570,17 +22661,17 @@ OMPClause *Sema::ActOnOpenMPMapClause( // We need to produce a map clause even if we don't have variables so that // other diagnostics related with non-existing map clauses are accurate. 
return OMPMapClause::Create( - Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, MVLI.VarComponents, MVLI.UDMapperList, IteratorModifier, Modifiers, - ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(Context), MapperId, - MapType, IsMapTypeImplicit, MapLoc); + ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(getASTContext()), + MapperId, MapType, IsMapTypeImplicit, MapLoc); } -QualType Sema::ActOnOpenMPDeclareReductionType(SourceLocation TyLoc, - TypeResult ParsedType) { +QualType SemaOpenMP::ActOnOpenMPDeclareReductionType(SourceLocation TyLoc, + TypeResult ParsedType) { assert(ParsedType.isUsable()); - QualType ReductionType = GetTypeFromParser(ParsedType.get()); + QualType ReductionType = SemaRef.GetTypeFromParser(ParsedType.get()); if (ReductionType.isNull()) return QualType(); @@ -22608,15 +22699,17 @@ QualType Sema::ActOnOpenMPDeclareReductionType(SourceLocation TyLoc, return ReductionType; } -Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveStart( +SemaOpenMP::DeclGroupPtrTy +SemaOpenMP::ActOnOpenMPDeclareReductionDirectiveStart( Scope *S, DeclContext *DC, DeclarationName Name, ArrayRef> ReductionTypes, AccessSpecifier AS, Decl *PrevDeclInScope) { SmallVector Decls; Decls.reserve(ReductionTypes.size()); - LookupResult Lookup(*this, Name, SourceLocation(), LookupOMPReductionName, - forRedeclarationInCurContext()); + LookupResult Lookup(SemaRef, Name, SourceLocation(), + Sema::LookupOMPReductionName, + SemaRef.forRedeclarationInCurContext()); // [OpenMP 4.0], 2.15 declare reduction Directive, Restrictions // A reduction-identifier may not be re-declared in the current scope for the // same type or for a type that is compatible according to the base language @@ -22627,12 +22720,12 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveStart( if (S != nullptr) { // Find previous declaration with the same name not referenced in other // declarations. - FunctionScopeInfo *ParentFn = getEnclosingFunction(); + FunctionScopeInfo *ParentFn = SemaRef.getEnclosingFunction(); InCompoundScope = (ParentFn != nullptr) && !ParentFn->CompoundScopes.empty(); - LookupName(Lookup, S); - FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false, - /*AllowInlineNamespace=*/false); + SemaRef.LookupName(Lookup, S); + SemaRef.FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false, + /*AllowInlineNamespace=*/false); llvm::DenseMap UsedAsPrevious; LookupResult::Filter Filter = Lookup.makeFilter(); while (Filter.hasNext()) { @@ -22675,8 +22768,8 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveStart( Invalid = true; } PreviousRedeclTypes[TyData.first.getCanonicalType()] = TyData.second; - auto *DRD = OMPDeclareReductionDecl::Create(Context, DC, TyData.second, - Name, TyData.first, PrevDRD); + auto *DRD = OMPDeclareReductionDecl::Create( + getASTContext(), DC, TyData.second, Name, TyData.first, PrevDRD); DC->addDecl(DRD); DRD->setAccess(AS); Decls.push_back(DRD); @@ -22687,24 +22780,24 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveStart( } return DeclGroupPtrTy::make( - DeclGroupRef::Create(Context, Decls.begin(), Decls.size())); + DeclGroupRef::Create(getASTContext(), Decls.begin(), Decls.size())); } -void Sema::ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D) { +void SemaOpenMP::ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D) { auto *DRD = cast(D); // Enter new function scope. 
- PushFunctionScope(); - setFunctionHasBranchProtectedScope(); - getCurFunction()->setHasOMPDeclareReductionCombiner(); + SemaRef.PushFunctionScope(); + SemaRef.setFunctionHasBranchProtectedScope(); + SemaRef.getCurFunction()->setHasOMPDeclareReductionCombiner(); if (S != nullptr) - PushDeclContext(S, DRD); + SemaRef.PushDeclContext(S, DRD); else - CurContext = DRD; + SemaRef.CurContext = DRD; - PushExpressionEvaluationContext( - ExpressionEvaluationContext::PotentiallyEvaluated); + SemaRef.PushExpressionEvaluationContext( + Sema::ExpressionEvaluationContext::PotentiallyEvaluated); QualType ReductionType = DRD->getType(); // Create 'T* omp_parm;T omp_in;'. All references to 'omp_in' will @@ -22714,7 +22807,7 @@ void Sema::ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D) { // pointers. // Create 'T omp_in;' variable. VarDecl *OmpInParm = - buildVarDecl(*this, D->getLocation(), ReductionType, "omp_in"); + buildVarDecl(SemaRef, D->getLocation(), ReductionType, "omp_in"); // Create 'T* omp_parm;T omp_out;'. All references to 'omp_out' will // be replaced by '*omp_parm' during codegen. This required because 'omp_out' // uses semantics of argument handles by value, but it should be passed by @@ -22722,28 +22815,29 @@ void Sema::ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D) { // pointers. // Create 'T omp_out;' variable. VarDecl *OmpOutParm = - buildVarDecl(*this, D->getLocation(), ReductionType, "omp_out"); + buildVarDecl(SemaRef, D->getLocation(), ReductionType, "omp_out"); if (S != nullptr) { - PushOnScopeChains(OmpInParm, S); - PushOnScopeChains(OmpOutParm, S); + SemaRef.PushOnScopeChains(OmpInParm, S); + SemaRef.PushOnScopeChains(OmpOutParm, S); } else { DRD->addDecl(OmpInParm); DRD->addDecl(OmpOutParm); } Expr *InE = - ::buildDeclRefExpr(*this, OmpInParm, ReductionType, D->getLocation()); + ::buildDeclRefExpr(SemaRef, OmpInParm, ReductionType, D->getLocation()); Expr *OutE = - ::buildDeclRefExpr(*this, OmpOutParm, ReductionType, D->getLocation()); + ::buildDeclRefExpr(SemaRef, OmpOutParm, ReductionType, D->getLocation()); DRD->setCombinerData(InE, OutE); } -void Sema::ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner) { +void SemaOpenMP::ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, + Expr *Combiner) { auto *DRD = cast(D); - DiscardCleanupsInEvaluationContext(); - PopExpressionEvaluationContext(); + SemaRef.DiscardCleanupsInEvaluationContext(); + SemaRef.PopExpressionEvaluationContext(); - PopDeclContext(); - PopFunctionScopeInfo(); + SemaRef.PopDeclContext(); + SemaRef.PopFunctionScopeInfo(); if (Combiner != nullptr) DRD->setCombiner(Combiner); @@ -22751,20 +22845,21 @@ void Sema::ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner) { DRD->setInvalidDecl(); } -VarDecl *Sema::ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D) { +VarDecl *SemaOpenMP::ActOnOpenMPDeclareReductionInitializerStart(Scope *S, + Decl *D) { auto *DRD = cast(D); // Enter new function scope. - PushFunctionScope(); - setFunctionHasBranchProtectedScope(); + SemaRef.PushFunctionScope(); + SemaRef.setFunctionHasBranchProtectedScope(); if (S != nullptr) - PushDeclContext(S, DRD); + SemaRef.PushDeclContext(S, DRD); else - CurContext = DRD; + SemaRef.CurContext = DRD; - PushExpressionEvaluationContext( - ExpressionEvaluationContext::PotentiallyEvaluated); + SemaRef.PushExpressionEvaluationContext( + Sema::ExpressionEvaluationContext::PotentiallyEvaluated); QualType ReductionType = DRD->getType(); // Create 'T* omp_parm;T omp_priv;'. 
All references to 'omp_priv' will @@ -22774,7 +22869,7 @@ VarDecl *Sema::ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D) { // pointers. // Create 'T omp_priv;' variable. VarDecl *OmpPrivParm = - buildVarDecl(*this, D->getLocation(), ReductionType, "omp_priv"); + buildVarDecl(SemaRef, D->getLocation(), ReductionType, "omp_priv"); // Create 'T* omp_parm;T omp_orig;'. All references to 'omp_orig' will // be replaced by '*omp_parm' during codegen. This required because 'omp_orig' // uses semantics of argument handles by value, but it should be passed by @@ -22782,30 +22877,30 @@ VarDecl *Sema::ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D) { // pointers. // Create 'T omp_orig;' variable. VarDecl *OmpOrigParm = - buildVarDecl(*this, D->getLocation(), ReductionType, "omp_orig"); + buildVarDecl(SemaRef, D->getLocation(), ReductionType, "omp_orig"); if (S != nullptr) { - PushOnScopeChains(OmpPrivParm, S); - PushOnScopeChains(OmpOrigParm, S); + SemaRef.PushOnScopeChains(OmpPrivParm, S); + SemaRef.PushOnScopeChains(OmpOrigParm, S); } else { DRD->addDecl(OmpPrivParm); DRD->addDecl(OmpOrigParm); } Expr *OrigE = - ::buildDeclRefExpr(*this, OmpOrigParm, ReductionType, D->getLocation()); + ::buildDeclRefExpr(SemaRef, OmpOrigParm, ReductionType, D->getLocation()); Expr *PrivE = - ::buildDeclRefExpr(*this, OmpPrivParm, ReductionType, D->getLocation()); + ::buildDeclRefExpr(SemaRef, OmpPrivParm, ReductionType, D->getLocation()); DRD->setInitializerData(OrigE, PrivE); return OmpPrivParm; } -void Sema::ActOnOpenMPDeclareReductionInitializerEnd(Decl *D, Expr *Initializer, - VarDecl *OmpPrivParm) { +void SemaOpenMP::ActOnOpenMPDeclareReductionInitializerEnd( + Decl *D, Expr *Initializer, VarDecl *OmpPrivParm) { auto *DRD = cast(D); - DiscardCleanupsInEvaluationContext(); - PopExpressionEvaluationContext(); + SemaRef.DiscardCleanupsInEvaluationContext(); + SemaRef.PopExpressionEvaluationContext(); - PopDeclContext(); - PopFunctionScopeInfo(); + SemaRef.PopDeclContext(); + SemaRef.PopFunctionScopeInfo(); if (Initializer != nullptr) { DRD->setInitializer(Initializer, OMPDeclareReductionInitKind::Call); @@ -22819,13 +22914,13 @@ void Sema::ActOnOpenMPDeclareReductionInitializerEnd(Decl *D, Expr *Initializer, } } -Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveEnd( +SemaOpenMP::DeclGroupPtrTy SemaOpenMP::ActOnOpenMPDeclareReductionDirectiveEnd( Scope *S, DeclGroupPtrTy DeclReductions, bool IsValid) { for (Decl *D : DeclReductions.get()) { if (IsValid) { if (S) - PushOnScopeChains(cast(D), S, - /*AddToContext=*/false); + SemaRef.PushOnScopeChains(cast(D), S, + /*AddToContext=*/false); } else { D->setInvalidDecl(); } @@ -22833,25 +22928,26 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveEnd( return DeclReductions; } -TypeResult Sema::ActOnOpenMPDeclareMapperVarDecl(Scope *S, Declarator &D) { - TypeSourceInfo *TInfo = GetTypeForDeclarator(D); +TypeResult SemaOpenMP::ActOnOpenMPDeclareMapperVarDecl(Scope *S, + Declarator &D) { + TypeSourceInfo *TInfo = SemaRef.GetTypeForDeclarator(D); QualType T = TInfo->getType(); if (D.isInvalidType()) return true; if (getLangOpts().CPlusPlus) { // Check that there are no default arguments (C++ only). 
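
The declare-mapper entry points in this and the following hunks (ActOnOpenMPDeclareMapperVarDecl, ActOnOpenMPDeclareMapperType, ActOnOpenMPDeclareMapperDirective) process directives of this shape; an illustrative declaration, not from the patch (Vec is a made-up type), compiled with -fopenmp under OpenMP 5.0:

struct Vec {
  int *data;
  int len;
};

#pragma omp declare mapper(Vec v) map(v, v.data[0 : v.len])

void bump(Vec &v) {
#pragma omp target map(tofrom : v) // the user-declared mapper expands this
  for (int i = 0; i < v.len; ++i)
    v.data[i] += 1;
}
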
- CheckExtraCXXDefaultArguments(D); + SemaRef.CheckExtraCXXDefaultArguments(D); } - return CreateParsedType(T, TInfo); + return SemaRef.CreateParsedType(T, TInfo); } -QualType Sema::ActOnOpenMPDeclareMapperType(SourceLocation TyLoc, - TypeResult ParsedType) { +QualType SemaOpenMP::ActOnOpenMPDeclareMapperType(SourceLocation TyLoc, + TypeResult ParsedType) { assert(ParsedType.isUsable() && "Expect usable parsed mapper type"); - QualType MapperType = GetTypeFromParser(ParsedType.get()); + QualType MapperType = SemaRef.GetTypeFromParser(ParsedType.get()); assert(!MapperType.isNull() && "Expect valid mapper type"); // [OpenMP 5.0], 2.19.7.3 declare mapper Directive, Restrictions @@ -22863,12 +22959,13 @@ QualType Sema::ActOnOpenMPDeclareMapperType(SourceLocation TyLoc, return MapperType; } -Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareMapperDirective( +SemaOpenMP::DeclGroupPtrTy SemaOpenMP::ActOnOpenMPDeclareMapperDirective( Scope *S, DeclContext *DC, DeclarationName Name, QualType MapperType, SourceLocation StartLoc, DeclarationName VN, AccessSpecifier AS, Expr *MapperVarRef, ArrayRef Clauses, Decl *PrevDeclInScope) { - LookupResult Lookup(*this, Name, SourceLocation(), LookupOMPMapperName, - forRedeclarationInCurContext()); + LookupResult Lookup(SemaRef, Name, SourceLocation(), + Sema::LookupOMPMapperName, + SemaRef.forRedeclarationInCurContext()); // [OpenMP 5.0], 2.19.7.3 declare mapper Directive, Restrictions // A mapper-identifier may not be redeclared in the current scope for the // same type or for a type that is compatible according to the base language @@ -22879,12 +22976,12 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareMapperDirective( if (S != nullptr) { // Find previous declaration with the same name not referenced in other // declarations. - FunctionScopeInfo *ParentFn = getEnclosingFunction(); + FunctionScopeInfo *ParentFn = SemaRef.getEnclosingFunction(); InCompoundScope = (ParentFn != nullptr) && !ParentFn->CompoundScopes.empty(); - LookupName(Lookup, S); - FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false, - /*AllowInlineNamespace=*/false); + SemaRef.LookupName(Lookup, S); + SemaRef.FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false, + /*AllowInlineNamespace=*/false); llvm::DenseMap UsedAsPrevious; LookupResult::Filter Filter = Lookup.makeFilter(); while (Filter.hasNext()) { @@ -22929,13 +23026,14 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareMapperDirective( // mappers. 
SmallVector ClausesWithImplicit(Clauses.begin(), Clauses.end()); - if (LangOpts.OpenMP >= 50) - processImplicitMapsWithDefaultMappers(*this, DSAStack, ClausesWithImplicit); - auto *DMD = - OMPDeclareMapperDecl::Create(Context, DC, StartLoc, Name, MapperType, VN, - ClausesWithImplicit, PrevDMD); + if (getLangOpts().OpenMP >= 50) + processImplicitMapsWithDefaultMappers(SemaRef, DSAStack, + ClausesWithImplicit); + auto *DMD = OMPDeclareMapperDecl::Create(getASTContext(), DC, StartLoc, Name, + MapperType, VN, ClausesWithImplicit, + PrevDMD); if (S) - PushOnScopeChains(DMD, S); + SemaRef.PushOnScopeChains(DMD, S); else DC->addDecl(DMD); DMD->setAccess(AS); @@ -22951,105 +23049,106 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareMapperDirective( return DeclGroupPtrTy::make(DeclGroupRef(DMD)); } -ExprResult -Sema::ActOnOpenMPDeclareMapperDirectiveVarDecl(Scope *S, QualType MapperType, - SourceLocation StartLoc, - DeclarationName VN) { +ExprResult SemaOpenMP::ActOnOpenMPDeclareMapperDirectiveVarDecl( + Scope *S, QualType MapperType, SourceLocation StartLoc, + DeclarationName VN) { TypeSourceInfo *TInfo = - Context.getTrivialTypeSourceInfo(MapperType, StartLoc); - auto *VD = VarDecl::Create(Context, Context.getTranslationUnitDecl(), - StartLoc, StartLoc, VN.getAsIdentifierInfo(), - MapperType, TInfo, SC_None); + getASTContext().getTrivialTypeSourceInfo(MapperType, StartLoc); + auto *VD = VarDecl::Create( + getASTContext(), getASTContext().getTranslationUnitDecl(), StartLoc, + StartLoc, VN.getAsIdentifierInfo(), MapperType, TInfo, SC_None); if (S) - PushOnScopeChains(VD, S, /*AddToContext=*/false); - Expr *E = buildDeclRefExpr(*this, VD, MapperType, StartLoc); + SemaRef.PushOnScopeChains(VD, S, /*AddToContext=*/false); + Expr *E = buildDeclRefExpr(SemaRef, VD, MapperType, StartLoc); DSAStack->addDeclareMapperVarRef(E); return E; } -void Sema::ActOnOpenMPIteratorVarDecl(VarDecl *VD) { +void SemaOpenMP::ActOnOpenMPIteratorVarDecl(VarDecl *VD) { if (DSAStack->getDeclareMapperVarRef()) DSAStack->addIteratorVarDecl(VD); } -bool Sema::isOpenMPDeclareMapperVarDeclAllowed(const VarDecl *VD) const { - assert(LangOpts.OpenMP && "Expected OpenMP mode."); +bool SemaOpenMP::isOpenMPDeclareMapperVarDeclAllowed(const VarDecl *VD) const { + assert(getLangOpts().OpenMP && "Expected OpenMP mode."); const Expr *Ref = DSAStack->getDeclareMapperVarRef(); if (const auto *DRE = cast_or_null(Ref)) { if (VD->getCanonicalDecl() == DRE->getDecl()->getCanonicalDecl()) return true; - if (VD->isUsableInConstantExpressions(Context)) + if (VD->isUsableInConstantExpressions(getASTContext())) return true; - if (LangOpts.OpenMP >= 52 && DSAStack->isIteratorVarDecl(VD)) + if (getLangOpts().OpenMP >= 52 && DSAStack->isIteratorVarDecl(VD)) return true; return false; } return true; } -const ValueDecl *Sema::getOpenMPDeclareMapperVarName() const { - assert(LangOpts.OpenMP && "Expected OpenMP mode."); +const ValueDecl *SemaOpenMP::getOpenMPDeclareMapperVarName() const { + assert(getLangOpts().OpenMP && "Expected OpenMP mode."); return cast(DSAStack->getDeclareMapperVarRef())->getDecl(); } -OMPClause *Sema::ActOnOpenMPNumTeamsClause(Expr *NumTeams, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPNumTeamsClause(Expr *NumTeams, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = NumTeams; Stmt *HelperValStmt = nullptr; // OpenMP [teams Constrcut, Restrictions] // The num_teams expression must evaluate to a positive integer value. 
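
Sketching the accepted and rejected forms: num_teams(8) below is fine, while num_teams(0) or a negative value trips the StrictlyPositive check above.

    #pragma omp teams num_teams(8)
    { /* executed by a league of up to 8 teams */ }
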
- if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_num_teams, + if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_num_teams, /*StrictlyPositive=*/true)) return nullptr; OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); - OpenMPDirectiveKind CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_num_teams, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause( + DKind, OMPC_num_teams, getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } - return new (Context) OMPNumTeamsClause(ValExpr, HelperValStmt, CaptureRegion, - StartLoc, LParenLoc, EndLoc); + return new (getASTContext()) OMPNumTeamsClause( + ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPThreadLimitClause(Expr *ThreadLimit, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPThreadLimitClause(Expr *ThreadLimit, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = ThreadLimit; Stmt *HelperValStmt = nullptr; // OpenMP [teams Constrcut, Restrictions] // The thread_limit expression must evaluate to a positive integer value. - if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_thread_limit, + if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_thread_limit, /*StrictlyPositive=*/true)) return nullptr; OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause( - DKind, OMPC_thread_limit, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + DKind, OMPC_thread_limit, getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } - return new (Context) OMPThreadLimitClause( + return new (getASTContext()) OMPThreadLimitClause( ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPPriorityClause(Expr *Priority, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPPriorityClause(Expr *Priority, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = Priority; Stmt *HelperValStmt = nullptr; OpenMPDirectiveKind CaptureRegion = OMPD_unknown; @@ -23057,20 +23156,20 @@ OMPClause *Sema::ActOnOpenMPPriorityClause(Expr *Priority, // OpenMP [2.9.1, task Constrcut] // The priority-value is a non-negative numerical scalar expression. 
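
Unlike num_teams and thread_limit above, StrictlyPositive is false here, so priority(0) is accepted. A minimal sketch:

    #pragma omp task priority(10)
    { /* scheduling hint relative to other generated tasks */ }
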
if (!isNonNegativeIntegerValue( - ValExpr, *this, OMPC_priority, + ValExpr, SemaRef, OMPC_priority, /*StrictlyPositive=*/false, /*BuildCapture=*/true, DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt)) return nullptr; - return new (Context) OMPPriorityClause(ValExpr, HelperValStmt, CaptureRegion, - StartLoc, LParenLoc, EndLoc); + return new (getASTContext()) OMPPriorityClause( + ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPGrainsizeClause( +OMPClause *SemaOpenMP::ActOnOpenMPGrainsizeClause( OpenMPGrainsizeClauseModifier Modifier, Expr *Grainsize, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation EndLoc) { - assert((ModifierLoc.isInvalid() || LangOpts.OpenMP >= 51) && + assert((ModifierLoc.isInvalid() || getLangOpts().OpenMP >= 51) && "Unexpected grainsize modifier in OpenMP < 51."); if (ModifierLoc.isValid() && Modifier == OMPC_GRAINSIZE_unknown) { @@ -23088,23 +23187,23 @@ OMPClause *Sema::ActOnOpenMPGrainsizeClause( // OpenMP [2.9.2, taskloop Constrcut] // The parameter of the grainsize clause must be a positive integer // expression. - if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_grainsize, + if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_grainsize, /*StrictlyPositive=*/true, /*BuildCapture=*/true, DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt)) return nullptr; - return new (Context) + return new (getASTContext()) OMPGrainsizeClause(Modifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, ModifierLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPNumTasksClause( +OMPClause *SemaOpenMP::ActOnOpenMPNumTasksClause( OpenMPNumTasksClauseModifier Modifier, Expr *NumTasks, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation EndLoc) { - assert((ModifierLoc.isInvalid() || LangOpts.OpenMP >= 51) && + assert((ModifierLoc.isInvalid() || getLangOpts().OpenMP >= 51) && "Unexpected num_tasks modifier in OpenMP < 51."); if (ModifierLoc.isValid() && Modifier == OMPC_NUMTASKS_unknown) { @@ -23123,19 +23222,20 @@ OMPClause *Sema::ActOnOpenMPNumTasksClause( // The parameter of the num_tasks clause must be a positive integer // expression. if (!isNonNegativeIntegerValue( - ValExpr, *this, OMPC_num_tasks, + ValExpr, SemaRef, OMPC_num_tasks, /*StrictlyPositive=*/true, /*BuildCapture=*/true, DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt)) return nullptr; - return new (Context) + return new (getASTContext()) OMPNumTasksClause(Modifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, ModifierLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPHintClause(Expr *Hint, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPHintClause(Expr *Hint, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { // OpenMP [2.13.2, critical construct, Description] // ... where hint-expression is an integer constant expression that evaluates // to a valid lock hint. 
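
A sketch of a conforming use, assuming the omp_sync_hint_* constants from <omp.h>:

    #pragma omp critical (lck) hint(omp_sync_hint_contended)
    { /* ... */ }
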
@@ -23143,7 +23243,7 @@ OMPClause *Sema::ActOnOpenMPHintClause(Expr *Hint, SourceLocation StartLoc, VerifyPositiveIntegerConstantInClause(Hint, OMPC_hint, false); if (HintExpr.isInvalid()) return nullptr; - return new (Context) + return new (getASTContext()) OMPHintClause(HintExpr.get(), StartLoc, LParenLoc, EndLoc); } @@ -23163,13 +23263,14 @@ static bool findOMPEventHandleT(Sema &S, SourceLocation Loc, return true; } -OMPClause *Sema::ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPDetachClause(Expr *Evt, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (!Evt->isValueDependent() && !Evt->isTypeDependent() && !Evt->isInstantiationDependent() && !Evt->containsUnexpandedParameterPack()) { - if (!findOMPEventHandleT(*this, Evt->getExprLoc(), DSAStack)) + if (!findOMPEventHandleT(SemaRef, Evt->getExprLoc(), DSAStack)) return nullptr; // OpenMP 5.0, 2.10.1 task Construct. // event-handle is a variable of the omp_event_handle_t type. @@ -23185,9 +23286,9 @@ OMPClause *Sema::ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc, << "omp_event_handle_t" << 0 << Evt->getSourceRange(); return nullptr; } - if (!Context.hasSameUnqualifiedType(DSAStack->getOMPEventHandleT(), - VD->getType()) || - VD->getType().isConstant(Context)) { + if (!getASTContext().hasSameUnqualifiedType(DSAStack->getOMPEventHandleT(), + VD->getType()) || + VD->getType().isConstant(getASTContext())) { Diag(Evt->getExprLoc(), diag::err_omp_var_expected) << "omp_event_handle_t" << 1 << VD->getType() << Evt->getSourceRange(); @@ -23202,15 +23303,16 @@ OMPClause *Sema::ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc, Diag(Evt->getExprLoc(), diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_firstprivate); - reportOriginalDsa(*this, DSAStack, VD, DVar); + reportOriginalDsa(SemaRef, DSAStack, VD, DVar); return nullptr; } } - return new (Context) OMPDetachClause(Evt, StartLoc, LParenLoc, EndLoc); + return new (getASTContext()) + OMPDetachClause(Evt, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPDistScheduleClause( +OMPClause *SemaOpenMP::ActOnOpenMPDistScheduleClause( OpenMPDistScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc) { @@ -23241,7 +23343,7 @@ OMPClause *Sema::ActOnOpenMPDistScheduleClause( // chunk_size must be a loop invariant integer expression with a positive // value. 
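
For example, a positive, loop-invariant chunk size:

    #pragma omp distribute dist_schedule(static, 16)
    for (int i = 0; i < N; ++i)
      body(i);
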
if (std::optional Result = - ValExpr->getIntegerConstantExpr(Context)) { + ValExpr->getIntegerConstantExpr(getASTContext())) { if (Result->isSigned() && !Result->isStrictlyPositive()) { Diag(ChunkSizeLoc, diag::err_omp_negative_expression_in_clause) << "dist_schedule" << ChunkSize->getSourceRange(); @@ -23249,22 +23351,22 @@ OMPClause *Sema::ActOnOpenMPDistScheduleClause( } } else if (getOpenMPCaptureRegionForClause( DSAStack->getCurrentDirective(), OMPC_dist_schedule, - LangOpts.OpenMP) != OMPD_unknown && - !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + getLangOpts().OpenMP) != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } } } - return new (Context) + return new (getASTContext()) OMPDistScheduleClause(StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc, Kind, ValExpr, HelperValStmt); } -OMPClause *Sema::ActOnOpenMPDefaultmapClause( +OMPClause *SemaOpenMP::ActOnOpenMPDefaultmapClause( OpenMPDefaultmapClauseModifier M, OpenMPDefaultmapClauseKind Kind, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc, SourceLocation KindLoc, SourceLocation EndLoc) { @@ -23291,10 +23393,10 @@ OMPClause *Sema::ActOnOpenMPDefaultmapClause( } else { bool isDefaultmapModifier = (M != OMPC_DEFAULTMAP_MODIFIER_unknown); bool isDefaultmapKind = (Kind != OMPC_DEFAULTMAP_unknown) || - (LangOpts.OpenMP >= 50 && KindLoc.isInvalid()); + (getLangOpts().OpenMP >= 50 && KindLoc.isInvalid()); if (!isDefaultmapKind || !isDefaultmapModifier) { StringRef KindValue = "'scalar', 'aggregate', 'pointer'"; - if (LangOpts.OpenMP == 50) { + if (getLangOpts().OpenMP == 50) { StringRef ModifierValue = "'alloc', 'from', 'to', 'tofrom', " "'firstprivate', 'none', 'default'"; if (!isDefaultmapKind && isDefaultmapModifier) { @@ -23346,13 +23448,13 @@ OMPClause *Sema::ActOnOpenMPDefaultmapClause( DSAStack->setDefaultDMAAttr(M, Kind, StartLoc); } - return new (Context) + return new (getASTContext()) OMPDefaultmapClause(StartLoc, LParenLoc, MLoc, KindLoc, EndLoc, Kind, M); } -bool Sema::ActOnStartOpenMPDeclareTargetContext( +bool SemaOpenMP::ActOnStartOpenMPDeclareTargetContext( DeclareTargetContextInfo &DTCI) { - DeclContext *CurLexicalContext = getCurLexicalContext(); + DeclContext *CurLexicalContext = SemaRef.getCurLexicalContext(); if (!CurLexicalContext->isFileContext() && !CurLexicalContext->isExternCContext() && !CurLexicalContext->isExternCXXContext() && @@ -23372,20 +23474,20 @@ bool Sema::ActOnStartOpenMPDeclareTargetContext( return true; } -const Sema::DeclareTargetContextInfo -Sema::ActOnOpenMPEndDeclareTargetDirective() { +const SemaOpenMP::DeclareTargetContextInfo +SemaOpenMP::ActOnOpenMPEndDeclareTargetDirective() { assert(!DeclareTargetNesting.empty() && "check isInOpenMPDeclareTargetContext() first!"); return DeclareTargetNesting.pop_back_val(); } -void Sema::ActOnFinishedOpenMPDeclareTargetContext( +void SemaOpenMP::ActOnFinishedOpenMPDeclareTargetContext( DeclareTargetContextInfo &DTCI) { for (auto &It : DTCI.ExplicitlyMapped) ActOnOpenMPDeclareTargetName(It.first, It.second.Loc, It.second.MT, DTCI); } -void Sema::DiagnoseUnterminatedOpenMPDeclareTarget() { +void SemaOpenMP::DiagnoseUnterminatedOpenMPDeclareTarget() { if (DeclareTargetNesting.empty()) 
return; DeclareTargetContextInfo &DTCI = DeclareTargetNesting.back(); @@ -23393,23 +23495,23 @@ void Sema::DiagnoseUnterminatedOpenMPDeclareTarget() { << getOpenMPDirectiveName(DTCI.Kind); } -NamedDecl *Sema::lookupOpenMPDeclareTargetName(Scope *CurScope, - CXXScopeSpec &ScopeSpec, - const DeclarationNameInfo &Id) { - LookupResult Lookup(*this, Id, LookupOrdinaryName); - LookupParsedName(Lookup, CurScope, &ScopeSpec, true); +NamedDecl *SemaOpenMP::lookupOpenMPDeclareTargetName( + Scope *CurScope, CXXScopeSpec &ScopeSpec, const DeclarationNameInfo &Id) { + LookupResult Lookup(SemaRef, Id, Sema::LookupOrdinaryName); + SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec, true); if (Lookup.isAmbiguous()) return nullptr; Lookup.suppressDiagnostics(); if (!Lookup.isSingleResult()) { - VarOrFuncDeclFilterCCC CCC(*this); + VarOrFuncDeclFilterCCC CCC(SemaRef); if (TypoCorrection Corrected = - CorrectTypo(Id, LookupOrdinaryName, CurScope, nullptr, CCC, - CTK_ErrorRecovery)) { - diagnoseTypo(Corrected, PDiag(diag::err_undeclared_var_use_suggest) - << Id.getName()); + SemaRef.CorrectTypo(Id, Sema::LookupOrdinaryName, CurScope, nullptr, + CCC, Sema::CTK_ErrorRecovery)) { + SemaRef.diagnoseTypo(Corrected, + SemaRef.PDiag(diag::err_undeclared_var_use_suggest) + << Id.getName()); checkDeclIsAllowedInOpenMPTarget(nullptr, Corrected.getCorrectionDecl()); return nullptr; } @@ -23427,9 +23529,9 @@ NamedDecl *Sema::lookupOpenMPDeclareTargetName(Scope *CurScope, return ND; } -void Sema::ActOnOpenMPDeclareTargetName(NamedDecl *ND, SourceLocation Loc, - OMPDeclareTargetDeclAttr::MapTypeTy MT, - DeclareTargetContextInfo &DTCI) { +void SemaOpenMP::ActOnOpenMPDeclareTargetName( + NamedDecl *ND, SourceLocation Loc, OMPDeclareTargetDeclAttr::MapTypeTy MT, + DeclareTargetContextInfo &DTCI) { assert((isa(ND) || isa(ND) || isa(ND)) && "Expected variable, function or function template."); @@ -23445,7 +23547,7 @@ void Sema::ActOnOpenMPDeclareTargetName(NamedDecl *ND, SourceLocation Loc, } // Diagnose marking after use as it may lead to incorrect diagnosis and // codegen. - if (LangOpts.OpenMP >= 50 && + if (getLangOpts().OpenMP >= 50 && (ND->isUsed(/*CheckUsedAttr=*/false) || ND->isReferenced())) Diag(Loc, diag::warn_omp_declare_target_after_first_use); @@ -23484,14 +23586,14 @@ void Sema::ActOnOpenMPDeclareTargetName(NamedDecl *ND, SourceLocation Loc, IsIndirect = true; } auto *A = OMPDeclareTargetDeclAttr::CreateImplicit( - Context, MT, DTCI.DT, IndirectE, IsIndirect, Level, + getASTContext(), MT, DTCI.DT, IndirectE, IsIndirect, Level, SourceRange(Loc, Loc)); ND->addAttr(A); - if (ASTMutationListener *ML = Context.getASTMutationListener()) + if (ASTMutationListener *ML = getASTContext().getASTMutationListener()) ML->DeclarationMarkedOpenMPDeclareTarget(ND, A); checkDeclIsAllowedInOpenMPTarget(nullptr, ND, Loc); if (auto *VD = dyn_cast(ND); - LangOpts.OpenMP && VD && VD->hasAttr() && + getLangOpts().OpenMP && VD && VD->hasAttr() && VD->hasGlobalStorage()) ActOnOpenMPDeclareTargetInitializer(ND); } @@ -23535,8 +23637,8 @@ static bool checkValueDeclInTarget(SourceLocation SL, SourceRange SR, /*FullCheck=*/false); } -void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, - SourceLocation IdLoc) { +void SemaOpenMP::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, + SourceLocation IdLoc) { if (!D || D->isInvalidDecl()) return; SourceRange SR = E ? E->getSourceRange() : D->getSourceRange(); @@ -23550,7 +23652,7 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, // directive. 
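
The lookup above resolves names in the list form of the directive; in rough terms, with f illustrative:

    void f();
    #pragma omp declare target to(f)      // spelling up to OpenMP 5.1
    #pragma omp declare target enter(f)   // OpenMP 5.2 spelling, hence the MT_To/MT_Enter split in this code
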
if (DSAStack->isThreadPrivate(VD)) { Diag(SL, diag::err_omp_threadprivate_in_target); - reportOriginalDsa(*this, DSAStack, VD, DSAStack->getTopDSA(VD, false)); + reportOriginalDsa(SemaRef, DSAStack, VD, DSAStack->getTopDSA(VD, false)); return; } } @@ -23569,7 +23671,7 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, // Problem if any with var declared with incomplete type will be reported // as normal, so no need to check it here. if ((E || !VD->getType()->isIncompleteType()) && - !checkValueDeclInTarget(SL, SR, *this, DSAStack, VD)) + !checkValueDeclInTarget(SL, SR, SemaRef, DSAStack, VD)) return; if (!E && isInOpenMPDeclareTargetContext()) { // Checking declaration inside declare target region. @@ -23589,13 +23691,13 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, IsIndirect = true; } auto *A = OMPDeclareTargetDeclAttr::CreateImplicit( - Context, + getASTContext(), getLangOpts().OpenMP >= 52 ? OMPDeclareTargetDeclAttr::MT_Enter : OMPDeclareTargetDeclAttr::MT_To, DTCI.DT, IndirectE, IsIndirect, Level, SourceRange(DTCI.Loc, DTCI.Loc)); D->addAttr(A); - if (ASTMutationListener *ML = Context.getASTMutationListener()) + if (ASTMutationListener *ML = getASTContext().getASTMutationListener()) ML->DeclarationMarkedOpenMPDeclareTarget(D, A); } return; @@ -23603,7 +23705,7 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, } if (!E) return; - checkDeclInTargetContext(E->getExprLoc(), E->getSourceRange(), *this, D); + checkDeclInTargetContext(E->getExprLoc(), E->getSourceRange(), SemaRef, D); } /// This class visits every VarDecl that the initializer references and adds @@ -23649,13 +23751,13 @@ class GlobalDeclRefChecker final /// Adding OMPDeclareTargetDeclAttr to variables with static storage /// duration that are referenced in the initializer expression list of /// variables with static storage duration in declare target directive. 
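
Sketching the case the visitor documented above covers:

    int a = 1;
    #pragma omp declare target
    int b = a;   // referencing 'a' in b's initializer marks 'a' declare-target as well
    #pragma omp end declare target
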
-void Sema::ActOnOpenMPDeclareTargetInitializer(Decl *TargetDecl) { +void SemaOpenMP::ActOnOpenMPDeclareTargetInitializer(Decl *TargetDecl) { GlobalDeclRefChecker Checker; if (isa(TargetDecl)) Checker.declareTargetInitializer(TargetDecl); } -OMPClause *Sema::ActOnOpenMPToClause( +OMPClause *SemaOpenMP::ActOnOpenMPToClause( ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, @@ -23681,18 +23783,18 @@ OMPClause *Sema::ActOnOpenMPToClause( } MappableVarListInfo MVLI(VarList); - checkMappableExpressionList(*this, DSAStack, OMPC_to, MVLI, Locs.StartLoc, + checkMappableExpressionList(SemaRef, DSAStack, OMPC_to, MVLI, Locs.StartLoc, MapperIdScopeSpec, MapperId, UnresolvedMappers); if (MVLI.ProcessedVarList.empty()) return nullptr; return OMPToClause::Create( - Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, - MapperIdScopeSpec.getWithLocInContext(Context), MapperId); + MapperIdScopeSpec.getWithLocInContext(getASTContext()), MapperId); } -OMPClause *Sema::ActOnOpenMPFromClause( +OMPClause *SemaOpenMP::ActOnOpenMPFromClause( ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, @@ -23718,19 +23820,20 @@ OMPClause *Sema::ActOnOpenMPFromClause( } MappableVarListInfo MVLI(VarList); - checkMappableExpressionList(*this, DSAStack, OMPC_from, MVLI, Locs.StartLoc, + checkMappableExpressionList(SemaRef, DSAStack, OMPC_from, MVLI, Locs.StartLoc, MapperIdScopeSpec, MapperId, UnresolvedMappers); if (MVLI.ProcessedVarList.empty()) return nullptr; return OMPFromClause::Create( - Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, - MapperIdScopeSpec.getWithLocInContext(Context), MapperId); + MapperIdScopeSpec.getWithLocInContext(getASTContext()), MapperId); } -OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs) { +OMPClause * +SemaOpenMP::ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs) { MappableVarListInfo MVLI(VarList); SmallVector PrivateCopies; SmallVector Inits; @@ -23740,7 +23843,7 @@ OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. MVLI.ProcessedVarList.push_back(RefExpr); @@ -23765,30 +23868,30 @@ OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, // Build the private variable and the expression that refers to it. auto VDPrivate = - buildVarDecl(*this, ELoc, Type, D->getName(), + buildVarDecl(SemaRef, ELoc, Type, D->getName(), D->hasAttrs() ? &D->getAttrs() : nullptr, VD ? cast(SimpleRefExpr) : nullptr); if (VDPrivate->isInvalidDecl()) continue; - CurContext->addDecl(VDPrivate); + SemaRef.CurContext->addDecl(VDPrivate); DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr( - *this, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc); + SemaRef, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc); // Add temporary variable to initialize the private copy of the pointer. 
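
At the source level, the privatized pointer built here corresponds to a use of the clause roughly like the following (host_buffer and n are illustrative):

    double *p = host_buffer;
    #pragma omp target data map(p[0 : n]) use_device_ptr(p)
    {
      /* within this region, p evaluates to the corresponding device pointer */
    }
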
VarDecl *VDInit = - buildVarDecl(*this, RefExpr->getExprLoc(), Type, ".devptr.temp"); + buildVarDecl(SemaRef, RefExpr->getExprLoc(), Type, ".devptr.temp"); DeclRefExpr *VDInitRefExpr = buildDeclRefExpr( - *this, VDInit, RefExpr->getType(), RefExpr->getExprLoc()); - AddInitializerToDecl(VDPrivate, - DefaultLvalueConversion(VDInitRefExpr).get(), - /*DirectInit=*/false); + SemaRef, VDInit, RefExpr->getType(), RefExpr->getExprLoc()); + SemaRef.AddInitializerToDecl( + VDPrivate, SemaRef.DefaultLvalueConversion(VDInitRefExpr).get(), + /*DirectInit=*/false); // If required, build a capture to implement the privatization initialized // with the current list item value. DeclRefExpr *Ref = nullptr; if (!VD) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); MVLI.ProcessedVarList.push_back(VD ? RefExpr->IgnoreParens() : Ref); PrivateCopies.push_back(VDPrivateRefExpr); Inits.push_back(VDInitRefExpr); @@ -23810,12 +23913,13 @@ OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, return nullptr; return OMPUseDevicePtrClause::Create( - Context, Locs, MVLI.ProcessedVarList, PrivateCopies, Inits, + getASTContext(), Locs, MVLI.ProcessedVarList, PrivateCopies, Inits, MVLI.VarBaseDeclarations, MVLI.VarComponents); } -OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs) { +OMPClause * +SemaOpenMP::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs) { MappableVarListInfo MVLI(VarList); for (Expr *RefExpr : VarList) { @@ -23823,7 +23927,7 @@ OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange, + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange, /*AllowArraySection=*/true); if (Res.second) { // It will be analyzed later. @@ -23838,7 +23942,7 @@ OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, // with the current list item value. DeclRefExpr *Ref = nullptr; if (!VD) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); MVLI.ProcessedVarList.push_back(VD ? 
RefExpr->IgnoreParens() : Ref); // We need to add a data sharing attribute for this variable to make sure it @@ -23853,7 +23957,8 @@ OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, Expr *Component = SimpleRefExpr; if (VD && (isa(RefExpr->IgnoreParenImpCasts()) || isa(RefExpr->IgnoreParenImpCasts()))) - Component = DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get(); + Component = + SemaRef.DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get(); MVLI.VarComponents.back().emplace_back(Component, D, /*IsNonContiguous=*/false); } @@ -23861,20 +23966,21 @@ OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, if (MVLI.ProcessedVarList.empty()) return nullptr; - return OMPUseDeviceAddrClause::Create(Context, Locs, MVLI.ProcessedVarList, - MVLI.VarBaseDeclarations, - MVLI.VarComponents); + return OMPUseDeviceAddrClause::Create( + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + MVLI.VarComponents); } -OMPClause *Sema::ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs) { +OMPClause * +SemaOpenMP::ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs) { MappableVarListInfo MVLI(VarList); for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP is_device_ptr clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. MVLI.ProcessedVarList.push_back(RefExpr); @@ -23900,7 +24006,7 @@ OMPClause *Sema::ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_is_device_ptr) << getOpenMPDirectiveName(DSAStack->getCurrentDirective()); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -23944,20 +24050,21 @@ OMPClause *Sema::ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, if (MVLI.ProcessedVarList.empty()) return nullptr; - return OMPIsDevicePtrClause::Create(Context, Locs, MVLI.ProcessedVarList, - MVLI.VarBaseDeclarations, - MVLI.VarComponents); + return OMPIsDevicePtrClause::Create( + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + MVLI.VarComponents); } -OMPClause *Sema::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs) { +OMPClause * +SemaOpenMP::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs) { MappableVarListInfo MVLI(VarList); for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP has_device_addr clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange, + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange, /*AllowArraySection=*/true); if (Res.second) { // It will be analyzed later. 
@@ -23975,7 +24082,7 @@ OMPClause *Sema::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_has_device_addr) << getOpenMPDirectiveName(DSAStack->getCurrentDirective()); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -24000,16 +24107,17 @@ OMPClause *Sema::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, auto *VD = dyn_cast(D); if (VD && (isa(RefExpr->IgnoreParenImpCasts()) || isa(RefExpr->IgnoreParenImpCasts()))) - Component = DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get(); + Component = + SemaRef.DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get(); OMPClauseMappableExprCommon::MappableComponent MC( Component, D, /*IsNonContiguous=*/false); DSAStack->addMappableExpressionComponents( D, MC, /*WhereFoundClauseKind=*/OMPC_has_device_addr); // Record the expression we've just processed. - if (!VD && !CurContext->isDependentContext()) { + if (!VD && !SemaRef.CurContext->isDependentContext()) { DeclRefExpr *Ref = - buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); + buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); assert(Ref && "has_device_addr capture failed"); MVLI.ProcessedVarList.push_back(Ref); } else @@ -24030,27 +24138,27 @@ OMPClause *Sema::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, if (MVLI.ProcessedVarList.empty()) return nullptr; - return OMPHasDeviceAddrClause::Create(Context, Locs, MVLI.ProcessedVarList, - MVLI.VarBaseDeclarations, - MVLI.VarComponents); + return OMPHasDeviceAddrClause::Create( + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + MVLI.VarComponents); } -OMPClause *Sema::ActOnOpenMPAllocateClause( +OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( Expr *Allocator, ArrayRef VarList, SourceLocation StartLoc, SourceLocation ColonLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { if (Allocator) { // OpenMP [2.11.4 allocate Clause, Description] // allocator is an expression of omp_allocator_handle_t type. - if (!findOMPAllocatorHandleT(*this, Allocator->getExprLoc(), DSAStack)) + if (!findOMPAllocatorHandleT(SemaRef, Allocator->getExprLoc(), DSAStack)) return nullptr; - ExprResult AllocatorRes = DefaultLvalueConversion(Allocator); + ExprResult AllocatorRes = SemaRef.DefaultLvalueConversion(Allocator); if (AllocatorRes.isInvalid()) return nullptr; - AllocatorRes = PerformImplicitConversion(AllocatorRes.get(), - DSAStack->getOMPAllocatorHandleT(), - Sema::AA_Initializing, - /*AllowExplicit=*/true); + AllocatorRes = SemaRef.PerformImplicitConversion( + AllocatorRes.get(), DSAStack->getOMPAllocatorHandleT(), + Sema::AA_Initializing, + /*AllowExplicit=*/true); if (AllocatorRes.isInvalid()) return nullptr; Allocator = AllocatorRes.get(); @@ -24060,9 +24168,9 @@ OMPClause *Sema::ActOnOpenMPAllocateClause( // target region must specify an allocator expression unless a requires // directive with the dynamic_allocators clause is present in the same // compilation unit. - if (LangOpts.OpenMPIsTargetDevice && + if (getLangOpts().OpenMPIsTargetDevice && !DSAStack->hasRequiresDeclWithClause()) - targetDiag(StartLoc, diag::err_expected_allocator_expression); + SemaRef.targetDiag(StartLoc, diag::err_expected_allocator_expression); } // Analyze and build list of variables. 
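
A sketch of the clause under analysis, using a predefined allocator from <omp.h>:

    int x;
    #pragma omp parallel private(x) allocate(omp_high_bw_mem_alloc : x)
    { /* each thread's x comes from the high-bandwidth allocator */ }

On a target device the allocator expression is required unless a requires directive with dynamic_allocators is visible, per the diagnostic above.
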
SmallVector Vars; @@ -24071,7 +24179,7 @@ OMPClause *Sema::ActOnOpenMPAllocateClause( SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -24082,9 +24190,9 @@ OMPClause *Sema::ActOnOpenMPAllocateClause( auto *VD = dyn_cast(D); DeclRefExpr *Ref = nullptr; - if (!VD && !CurContext->isDependentContext()) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false); - Vars.push_back((VD || CurContext->isDependentContext()) + if (!VD && !SemaRef.CurContext->isDependentContext()) + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/false); + Vars.push_back((VD || SemaRef.CurContext->isDependentContext()) ? RefExpr->IgnoreParens() : Ref); } @@ -24094,21 +24202,21 @@ OMPClause *Sema::ActOnOpenMPAllocateClause( if (Allocator) DSAStack->addInnerAllocatorExpr(Allocator); - return OMPAllocateClause::Create(Context, StartLoc, LParenLoc, Allocator, - ColonLoc, EndLoc, Vars); + return OMPAllocateClause::Create(getASTContext(), StartLoc, LParenLoc, + Allocator, ColonLoc, EndLoc, Vars); } -OMPClause *Sema::ActOnOpenMPNontemporalClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPNontemporalClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector Vars; for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP nontemporal clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) // It will be analyzed later. Vars.push_back(RefExpr); @@ -24133,32 +24241,34 @@ OMPClause *Sema::ActOnOpenMPNontemporalClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPNontemporalClause::Create(Context, StartLoc, LParenLoc, EndLoc, - Vars); + return OMPNontemporalClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, Vars); } -StmtResult Sema::ActOnOpenMPScopeDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPScopeDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPScopeDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); + return OMPScopeDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt); } -OMPClause *Sema::ActOnOpenMPInclusiveClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPInclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector Vars; for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP nontemporal clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange, + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange, /*AllowArraySection=*/true); if (Res.second) // It will be analyzed later. 
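
The inclusive clause being validated here belongs to a scan directive inside an inscan reduction, roughly:

    int sum = 0;
    #pragma omp simd reduction(inscan, + : sum)
    for (int i = 0; i < n; ++i) {
      sum += a[i];
      #pragma omp scan inclusive(sum)
      b[i] = sum;   // inclusive prefix sum
    }
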
@@ -24185,20 +24295,21 @@ OMPClause *Sema::ActOnOpenMPInclusiveClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPInclusiveClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars); + return OMPInclusiveClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, Vars); } -OMPClause *Sema::ActOnOpenMPExclusiveClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPExclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector Vars; for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP nontemporal clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange, + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange, /*AllowArraySection=*/true); if (Res.second) // It will be analyzed later. @@ -24228,7 +24339,8 @@ OMPClause *Sema::ActOnOpenMPExclusiveClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPExclusiveClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars); + return OMPExclusiveClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, Vars); } /// Tries to find omp_alloctrait_t type. @@ -24246,19 +24358,20 @@ static bool findOMPAlloctraitT(Sema &S, SourceLocation Loc, DSAStackTy *Stack) { return true; } -OMPClause *Sema::ActOnOpenMPUsesAllocatorClause( +OMPClause *SemaOpenMP::ActOnOpenMPUsesAllocatorClause( SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc, ArrayRef Data) { + ASTContext &Context = getASTContext(); // OpenMP [2.12.5, target Construct] // allocator is an identifier of omp_allocator_handle_t type. - if (!findOMPAllocatorHandleT(*this, StartLoc, DSAStack)) + if (!findOMPAllocatorHandleT(SemaRef, StartLoc, DSAStack)) return nullptr; // OpenMP [2.12.5, target Construct] // allocator-traits-array is an identifier of const omp_alloctrait_t * type. if (llvm::any_of( Data, [](const UsesAllocatorsData &D) { return D.AllocatorTraits; }) && - !findOMPAlloctraitT(*this, StartLoc, DSAStack)) + !findOMPAlloctraitT(SemaRef, StartLoc, DSAStack)) return nullptr; llvm::SmallPtrSet, 4> PredefinedAllocators; for (int I = 0; I < OMPAllocateDeclAttr::OMPUserDefinedMemAlloc; ++I) { @@ -24266,8 +24379,8 @@ OMPClause *Sema::ActOnOpenMPUsesAllocatorClause( StringRef Allocator = OMPAllocateDeclAttr::ConvertAllocatorTypeTyToStr(AllocatorKind); DeclarationName AllocatorName = &Context.Idents.get(Allocator); - PredefinedAllocators.insert(LookupSingleName( - TUScope, AllocatorName, StartLoc, Sema::LookupAnyName)); + PredefinedAllocators.insert(SemaRef.LookupSingleName( + SemaRef.TUScope, AllocatorName, StartLoc, Sema::LookupAnyName)); } SmallVector NewData; @@ -24284,7 +24397,7 @@ OMPClause *Sema::ActOnOpenMPUsesAllocatorClause( bool IsPredefinedAllocator = false; if (DRE) { OMPAllocateDeclAttr::AllocatorTypeTy AllocatorTy = - getAllocatorKind(*this, DSAStack, AllocatorExpr); + getAllocatorKind(SemaRef, DSAStack, AllocatorExpr); IsPredefinedAllocator = AllocatorTy != OMPAllocateDeclAttr::AllocatorTypeTy::OMPUserDefinedMemAlloc; @@ -24329,7 +24442,7 @@ OMPClause *Sema::ActOnOpenMPUsesAllocatorClause( } // No allocator traits - just convert it to rvalue. 
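
In source form, the traits-free, predefined-allocator case handled here looks roughly like:

    #pragma omp target uses_allocators(omp_default_mem_alloc)
    { /* omp_default_mem_alloc is usable inside the target region */ }
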
if (!D.AllocatorTraits) - AllocatorExpr = DefaultLvalueConversion(AllocatorExpr).get(); + AllocatorExpr = SemaRef.DefaultLvalueConversion(AllocatorExpr).get(); DSAStack->addUsesAllocatorsDecl( DRE->getDecl(), IsPredefinedAllocator @@ -24376,11 +24489,11 @@ OMPClause *Sema::ActOnOpenMPUsesAllocatorClause( NewD.LParenLoc = D.LParenLoc; NewD.RParenLoc = D.RParenLoc; } - return OMPUsesAllocatorsClause::Create(Context, StartLoc, LParenLoc, EndLoc, - NewData); + return OMPUsesAllocatorsClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, NewData); } -OMPClause *Sema::ActOnOpenMPAffinityClause( +OMPClause *SemaOpenMP::ActOnOpenMPAffinityClause( SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, Expr *Modifier, ArrayRef Locators) { SmallVector Vars; @@ -24403,8 +24516,8 @@ OMPClause *Sema::ActOnOpenMPAffinityClause( ExprResult Res; { - Sema::TentativeAnalysisScope Trap(*this); - Res = CreateBuiltinUnaryOp(ELoc, UO_AddrOf, SimpleExpr); + Sema::TentativeAnalysisScope Trap(SemaRef); + Res = SemaRef.CreateBuiltinUnaryOp(ELoc, UO_AddrOf, SimpleExpr); } if (!Res.isUsable() && !isa(SimpleExpr) && !isa(SimpleExpr)) { @@ -24415,15 +24528,15 @@ OMPClause *Sema::ActOnOpenMPAffinityClause( Vars.push_back(SimpleExpr); } - return OMPAffinityClause::Create(Context, StartLoc, LParenLoc, ColonLoc, - EndLoc, Modifier, Vars); + return OMPAffinityClause::Create(getASTContext(), StartLoc, LParenLoc, + ColonLoc, EndLoc, Modifier, Vars); } -OMPClause *Sema::ActOnOpenMPBindClause(OpenMPBindClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPBindClause(OpenMPBindClauseKind Kind, + SourceLocation KindLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (Kind == OMPC_BIND_unknown) { Diag(KindLoc, diag::err_omp_unexpected_clause_value) << getListOfPossibleValues(OMPC_bind, /*First=*/0, @@ -24432,39 +24545,40 @@ OMPClause *Sema::ActOnOpenMPBindClause(OpenMPBindClauseKind Kind, return nullptr; } - return OMPBindClause::Create(Context, Kind, KindLoc, StartLoc, LParenLoc, - EndLoc); + return OMPBindClause::Create(getASTContext(), Kind, KindLoc, StartLoc, + LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPXDynCGroupMemClause(Expr *Size, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPXDynCGroupMemClause(Expr *Size, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = Size; Stmt *HelperValStmt = nullptr; // OpenMP [2.5, Restrictions] // The ompx_dyn_cgroup_mem expression must evaluate to a positive integer // value. 
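
This ompx_ clause is a Clang extension; note that despite the comment, the check passes StrictlyPositive=false, so a zero size is accepted. A rough sketch (n is illustrative):

    #pragma omp target ompx_dyn_cgroup_mem(n * sizeof(float))
    { /* requests dynamic per-team scratch memory for the kernel */ }
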
- if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_ompx_dyn_cgroup_mem, + if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_ompx_dyn_cgroup_mem, /*StrictlyPositive=*/false)) return nullptr; OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause( - DKind, OMPC_ompx_dyn_cgroup_mem, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + DKind, OMPC_ompx_dyn_cgroup_mem, getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } - return new (Context) OMPXDynCGroupMemClause( + return new (getASTContext()) OMPXDynCGroupMemClause( ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPDoacrossClause( +OMPClause *SemaOpenMP::ActOnOpenMPDoacrossClause( OpenMPDoacrossClauseModifier DepType, SourceLocation DepLoc, SourceLocation ColonLoc, ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { @@ -24483,7 +24597,7 @@ OMPClause *Sema::ActOnOpenMPDoacrossClause( DSAStackTy::OperatorOffsetTy OpsOffs; llvm::APSInt TotalDepCount(/*BitWidth=*/32); DoacrossDataInfoTy VarOffset = ProcessOpenMPDoacrossClauseCommon( - *this, + SemaRef, DepType == OMPC_DOACROSS_source || DepType == OMPC_DOACROSS_source_omp_cur_iteration || DepType == OMPC_DOACROSS_sink_omp_cur_iteration, @@ -24491,22 +24605,587 @@ OMPClause *Sema::ActOnOpenMPDoacrossClause( Vars = VarOffset.Vars; OpsOffs = VarOffset.OpsOffs; TotalDepCount = VarOffset.TotalDepCount; - auto *C = OMPDoacrossClause::Create(Context, StartLoc, LParenLoc, EndLoc, - DepType, DepLoc, ColonLoc, Vars, + auto *C = OMPDoacrossClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, DepType, DepLoc, ColonLoc, Vars, TotalDepCount.getZExtValue()); if (DSAStack->isParentOrderedRegion()) DSAStack->addDoacrossDependClause(C, OpsOffs); return C; } -OMPClause *Sema::ActOnOpenMPXAttributeClause(ArrayRef Attrs, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { - return new (Context) OMPXAttributeClause(Attrs, StartLoc, LParenLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPXAttributeClause(ArrayRef Attrs, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + return new (getASTContext()) + OMPXAttributeClause(Attrs, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPXBareClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPXBareClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPXBareClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPXBareClause(StartLoc, EndLoc); +} + +ExprResult SemaOpenMP::ActOnOMPArraySectionExpr( + Expr *Base, SourceLocation LBLoc, Expr *LowerBound, + SourceLocation ColonLocFirst, SourceLocation ColonLocSecond, Expr *Length, + Expr *Stride, SourceLocation RBLoc) { + ASTContext &Context = getASTContext(); + if (Base->hasPlaceholderType() && + !Base->hasPlaceholderType(BuiltinType::OMPArraySection)) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(Base); + if (Result.isInvalid()) + return ExprError(); + 
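
The doacross clause assembled a little above expresses cross-iteration dependences in an ordered loop, roughly:

    #pragma omp for ordered(1)
    for (int i = 1; i < n; ++i) {
      #pragma omp ordered doacross(sink : i - 1)
      a[i] = a[i - 1] + 1;
      #pragma omp ordered doacross(source :)
    }
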
Base = Result.get(); + } + if (LowerBound && LowerBound->getType()->isNonOverloadPlaceholderType()) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(LowerBound); + if (Result.isInvalid()) + return ExprError(); + Result = SemaRef.DefaultLvalueConversion(Result.get()); + if (Result.isInvalid()) + return ExprError(); + LowerBound = Result.get(); + } + if (Length && Length->getType()->isNonOverloadPlaceholderType()) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(Length); + if (Result.isInvalid()) + return ExprError(); + Result = SemaRef.DefaultLvalueConversion(Result.get()); + if (Result.isInvalid()) + return ExprError(); + Length = Result.get(); + } + if (Stride && Stride->getType()->isNonOverloadPlaceholderType()) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(Stride); + if (Result.isInvalid()) + return ExprError(); + Result = SemaRef.DefaultLvalueConversion(Result.get()); + if (Result.isInvalid()) + return ExprError(); + Stride = Result.get(); + } + + // Build an unanalyzed expression if either operand is type-dependent. + if (Base->isTypeDependent() || + (LowerBound && + (LowerBound->isTypeDependent() || LowerBound->isValueDependent())) || + (Length && (Length->isTypeDependent() || Length->isValueDependent())) || + (Stride && (Stride->isTypeDependent() || Stride->isValueDependent()))) { + return new (Context) OMPArraySectionExpr( + Base, LowerBound, Length, Stride, Context.DependentTy, VK_LValue, + OK_Ordinary, ColonLocFirst, ColonLocSecond, RBLoc); + } + + // Perform default conversions. + QualType OriginalTy = OMPArraySectionExpr::getBaseOriginalType(Base); + QualType ResultTy; + if (OriginalTy->isAnyPointerType()) { + ResultTy = OriginalTy->getPointeeType(); + } else if (OriginalTy->isArrayType()) { + ResultTy = OriginalTy->getAsArrayTypeUnsafe()->getElementType(); + } else { + return ExprError( + Diag(Base->getExprLoc(), diag::err_omp_typecheck_section_value) + << Base->getSourceRange()); + } + // C99 6.5.2.1p1 + if (LowerBound) { + auto Res = PerformOpenMPImplicitIntegerConversion(LowerBound->getExprLoc(), + LowerBound); + if (Res.isInvalid()) + return ExprError(Diag(LowerBound->getExprLoc(), + diag::err_omp_typecheck_section_not_integer) + << 0 << LowerBound->getSourceRange()); + LowerBound = Res.get(); + + if (LowerBound->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || + LowerBound->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) + Diag(LowerBound->getExprLoc(), diag::warn_omp_section_is_char) + << 0 << LowerBound->getSourceRange(); + } + if (Length) { + auto Res = + PerformOpenMPImplicitIntegerConversion(Length->getExprLoc(), Length); + if (Res.isInvalid()) + return ExprError(Diag(Length->getExprLoc(), + diag::err_omp_typecheck_section_not_integer) + << 1 << Length->getSourceRange()); + Length = Res.get(); + + if (Length->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || + Length->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) + Diag(Length->getExprLoc(), diag::warn_omp_section_is_char) + << 1 << Length->getSourceRange(); + } + if (Stride) { + ExprResult Res = + PerformOpenMPImplicitIntegerConversion(Stride->getExprLoc(), Stride); + if (Res.isInvalid()) + return ExprError(Diag(Stride->getExprLoc(), + diag::err_omp_typecheck_section_not_integer) + << 1 << Stride->getSourceRange()); + Stride = Res.get(); + + if (Stride->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || + Stride->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) + Diag(Stride->getExprLoc(), diag::warn_omp_section_is_char) + << 1 << Stride->getSourceRange(); + 
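
Collecting the constraints enforced around here: an OpenMP array section is base[lower-bound : length : stride] with a non-negative lower bound, a non-negative length, and a strictly positive stride, e.g.

    int a[16];
    #pragma omp target update to(a[0 : 8 : 2])   // elements 0, 2, ..., 14
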
} + + // C99 6.5.2.1p1: "shall have type "pointer to *object* type". Similarly, + // C++ [expr.sub]p1: The type "T" shall be a completely-defined object + // type. Note that functions are not objects, and that (in C99 parlance) + // incomplete types are not object types. + if (ResultTy->isFunctionType()) { + Diag(Base->getExprLoc(), diag::err_omp_section_function_type) + << ResultTy << Base->getSourceRange(); + return ExprError(); + } + + if (SemaRef.RequireCompleteType(Base->getExprLoc(), ResultTy, + diag::err_omp_section_incomplete_type, Base)) + return ExprError(); + + if (LowerBound && !OriginalTy->isAnyPointerType()) { + Expr::EvalResult Result; + if (LowerBound->EvaluateAsInt(Result, Context)) { + // OpenMP 5.0, [2.1.5 Array Sections] + // The array section must be a subset of the original array. + llvm::APSInt LowerBoundValue = Result.Val.getInt(); + if (LowerBoundValue.isNegative()) { + Diag(LowerBound->getExprLoc(), + diag::err_omp_section_not_subset_of_array) + << LowerBound->getSourceRange(); + return ExprError(); + } + } + } + + if (Length) { + Expr::EvalResult Result; + if (Length->EvaluateAsInt(Result, Context)) { + // OpenMP 5.0, [2.1.5 Array Sections] + // The length must evaluate to non-negative integers. + llvm::APSInt LengthValue = Result.Val.getInt(); + if (LengthValue.isNegative()) { + Diag(Length->getExprLoc(), diag::err_omp_section_length_negative) + << toString(LengthValue, /*Radix=*/10, /*Signed=*/true) + << Length->getSourceRange(); + return ExprError(); + } + } + } else if (ColonLocFirst.isValid() && + (OriginalTy.isNull() || (!OriginalTy->isConstantArrayType() && + !OriginalTy->isVariableArrayType()))) { + // OpenMP 5.0, [2.1.5 Array Sections] + // When the size of the array dimension is not known, the length must be + // specified explicitly. + Diag(ColonLocFirst, diag::err_omp_section_length_undefined) + << (!OriginalTy.isNull() && OriginalTy->isArrayType()); + return ExprError(); + } + + if (Stride) { + Expr::EvalResult Result; + if (Stride->EvaluateAsInt(Result, Context)) { + // OpenMP 5.0, [2.1.5 Array Sections] + // The stride must evaluate to a positive integer. + llvm::APSInt StrideValue = Result.Val.getInt(); + if (!StrideValue.isStrictlyPositive()) { + Diag(Stride->getExprLoc(), diag::err_omp_section_stride_non_positive) + << toString(StrideValue, /*Radix=*/10, /*Signed=*/true) + << Stride->getSourceRange(); + return ExprError(); + } + } + } + + if (!Base->hasPlaceholderType(BuiltinType::OMPArraySection)) { + ExprResult Result = SemaRef.DefaultFunctionArrayLvalueConversion(Base); + if (Result.isInvalid()) + return ExprError(); + Base = Result.get(); + } + return new (Context) OMPArraySectionExpr( + Base, LowerBound, Length, Stride, Context.OMPArraySectionTy, VK_LValue, + OK_Ordinary, ColonLocFirst, ColonLocSecond, RBLoc); +} + +ExprResult SemaOpenMP::ActOnOMPArrayShapingExpr( + Expr *Base, SourceLocation LParenLoc, SourceLocation RParenLoc, + ArrayRef Dims, ArrayRef Brackets) { + ASTContext &Context = getASTContext(); + if (Base->hasPlaceholderType()) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(Base); + if (Result.isInvalid()) + return ExprError(); + Result = SemaRef.DefaultLvalueConversion(Result.get()); + if (Result.isInvalid()) + return ExprError(); + Base = Result.get(); + } + QualType BaseTy = Base->getType(); + // Delay analysis of the types/expressions if instantiation/specialization is + // required. 
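
The array shaping expression being typed here reinterprets a pointer as an array view, roughly (p, rows, and cols are illustrative):

    extern double *p;   // assumed to point at rows * cols elements
    #pragma omp target update to( ([rows][cols])p )

Each dimension must evaluate to a positive integer, per the check below.
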
+ if (!BaseTy->isPointerType() && Base->isTypeDependent()) + return OMPArrayShapingExpr::Create(Context, Context.DependentTy, Base, + LParenLoc, RParenLoc, Dims, Brackets); + if (!BaseTy->isPointerType() || + (!Base->isTypeDependent() && + BaseTy->getPointeeType()->isIncompleteType())) + return ExprError(Diag(Base->getExprLoc(), + diag::err_omp_non_pointer_type_array_shaping_base) + << Base->getSourceRange()); + + SmallVector NewDims; + bool ErrorFound = false; + for (Expr *Dim : Dims) { + if (Dim->hasPlaceholderType()) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(Dim); + if (Result.isInvalid()) { + ErrorFound = true; + continue; + } + Result = SemaRef.DefaultLvalueConversion(Result.get()); + if (Result.isInvalid()) { + ErrorFound = true; + continue; + } + Dim = Result.get(); + } + if (!Dim->isTypeDependent()) { + ExprResult Result = + PerformOpenMPImplicitIntegerConversion(Dim->getExprLoc(), Dim); + if (Result.isInvalid()) { + ErrorFound = true; + Diag(Dim->getExprLoc(), diag::err_omp_typecheck_shaping_not_integer) + << Dim->getSourceRange(); + continue; + } + Dim = Result.get(); + Expr::EvalResult EvResult; + if (!Dim->isValueDependent() && Dim->EvaluateAsInt(EvResult, Context)) { + // OpenMP 5.0, [2.1.4 Array Shaping] + // Each si is an integral type expression that must evaluate to a + // positive integer. + llvm::APSInt Value = EvResult.Val.getInt(); + if (!Value.isStrictlyPositive()) { + Diag(Dim->getExprLoc(), diag::err_omp_shaping_dimension_not_positive) + << toString(Value, /*Radix=*/10, /*Signed=*/true) + << Dim->getSourceRange(); + ErrorFound = true; + continue; + } + } + } + NewDims.push_back(Dim); + } + if (ErrorFound) + return ExprError(); + return OMPArrayShapingExpr::Create(Context, Context.OMPArrayShapingTy, Base, + LParenLoc, RParenLoc, NewDims, Brackets); } + +ExprResult SemaOpenMP::ActOnOMPIteratorExpr(Scope *S, + SourceLocation IteratorKwLoc, + SourceLocation LLoc, + SourceLocation RLoc, + ArrayRef Data) { + ASTContext &Context = getASTContext(); + SmallVector ID; + bool IsCorrect = true; + for (const OMPIteratorData &D : Data) { + TypeSourceInfo *TInfo = nullptr; + SourceLocation StartLoc; + QualType DeclTy; + if (!D.Type.getAsOpaquePtr()) { + // OpenMP 5.0, 2.1.6 Iterators + // In an iterator-specifier, if the iterator-type is not specified then + // the type of that iterator is of int type. + DeclTy = Context.IntTy; + StartLoc = D.DeclIdentLoc; + } else { + DeclTy = Sema::GetTypeFromParser(D.Type, &TInfo); + StartLoc = TInfo->getTypeLoc().getBeginLoc(); + } + + bool IsDeclTyDependent = DeclTy->isDependentType() || + DeclTy->containsUnexpandedParameterPack() || + DeclTy->isInstantiationDependentType(); + if (!IsDeclTyDependent) { + if (!DeclTy->isIntegralType(Context) && !DeclTy->isAnyPointerType()) { + // OpenMP 5.0, 2.1.6 Iterators, Restrictions, C/C++ + // The iterator-type must be an integral or pointer type. + Diag(StartLoc, diag::err_omp_iterator_not_integral_or_pointer) + << DeclTy; + IsCorrect = false; + continue; + } + if (DeclTy.isConstant(Context)) { + // OpenMP 5.0, 2.1.6 Iterators, Restrictions, C/C++ + // The iterator-type must not be const qualified. + Diag(StartLoc, diag::err_omp_iterator_not_integral_or_pointer) + << DeclTy; + IsCorrect = false; + continue; + } + } + + // Iterator declaration. + assert(D.DeclIdent && "Identifier expected."); + // Always try to create iterator declarator to avoid extra error messages + // about unknown declarations use. 
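
An iterator of the kind declared here typically appears in a depend clause; when no iterator-type is given it defaults to int, as noted above:

    #pragma omp task depend(iterator(int it = 0 : n : 2), in : a[it])

For a range begin:end:step, the helper built further below computes the iteration count as

    step > 0 ? (end + step - 1 - begin) / step
             : (begin - step - 1 - end) / -step

so 0:10:3 yields (10 + 3 - 1 - 0) / 3 = 4 iterations (it = 0, 3, 6, 9).
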
+ auto *VD = + VarDecl::Create(Context, SemaRef.CurContext, StartLoc, D.DeclIdentLoc, + D.DeclIdent, DeclTy, TInfo, SC_None); + VD->setImplicit(); + if (S) { + // Check for conflicting previous declaration. + DeclarationNameInfo NameInfo(VD->getDeclName(), D.DeclIdentLoc); + LookupResult Previous(SemaRef, NameInfo, Sema::LookupOrdinaryName, + Sema::ForVisibleRedeclaration); + Previous.suppressDiagnostics(); + SemaRef.LookupName(Previous, S); + + SemaRef.FilterLookupForScope(Previous, SemaRef.CurContext, S, + /*ConsiderLinkage=*/false, + /*AllowInlineNamespace=*/false); + if (!Previous.empty()) { + NamedDecl *Old = Previous.getRepresentativeDecl(); + Diag(D.DeclIdentLoc, diag::err_redefinition) << VD->getDeclName(); + Diag(Old->getLocation(), diag::note_previous_definition); + } else { + SemaRef.PushOnScopeChains(VD, S); + } + } else { + SemaRef.CurContext->addDecl(VD); + } + + // Act on the iterator variable declaration. + ActOnOpenMPIteratorVarDecl(VD); + + Expr *Begin = D.Range.Begin; + if (!IsDeclTyDependent && Begin && !Begin->isTypeDependent()) { + ExprResult BeginRes = + SemaRef.PerformImplicitConversion(Begin, DeclTy, Sema::AA_Converting); + Begin = BeginRes.get(); + } + Expr *End = D.Range.End; + if (!IsDeclTyDependent && End && !End->isTypeDependent()) { + ExprResult EndRes = + SemaRef.PerformImplicitConversion(End, DeclTy, Sema::AA_Converting); + End = EndRes.get(); + } + Expr *Step = D.Range.Step; + if (!IsDeclTyDependent && Step && !Step->isTypeDependent()) { + if (!Step->getType()->isIntegralType(Context)) { + Diag(Step->getExprLoc(), diag::err_omp_iterator_step_not_integral) + << Step << Step->getSourceRange(); + IsCorrect = false; + continue; + } + std::optional<llvm::APSInt> Result = + Step->getIntegerConstantExpr(Context); + // OpenMP 5.0, 2.1.6 Iterators, Restrictions + // If the step expression of a range-specification equals zero, the + // behavior is unspecified. + if (Result && Result->isZero()) { + Diag(Step->getExprLoc(), diag::err_omp_iterator_step_constant_zero) + << Step << Step->getSourceRange(); + IsCorrect = false; + continue; + } + } + if (!Begin || !End || !IsCorrect) { + IsCorrect = false; + continue; + } + OMPIteratorExpr::IteratorDefinition &IDElem = ID.emplace_back(); + IDElem.IteratorDecl = VD; + IDElem.AssignmentLoc = D.AssignLoc; + IDElem.Range.Begin = Begin; + IDElem.Range.End = End; + IDElem.Range.Step = Step; + IDElem.ColonLoc = D.ColonLoc; + IDElem.SecondColonLoc = D.SecColonLoc; + } + if (!IsCorrect) { + // Invalidate all created iterator declarations if error is found. + for (const OMPIteratorExpr::IteratorDefinition &D : ID) { + if (Decl *ID = D.IteratorDecl) + ID->setInvalidDecl(); + } + return ExprError(); + } + SmallVector<OMPIteratorHelperData, 4> Helpers; + if (!SemaRef.CurContext->isDependentContext()) { + // Build the number of iterations for each iteration range. + // Ni = ((Stepi > 0) ?
((Endi + Stepi -1 - Begini)/Stepi) : + // ((Begini-Stepi-1-Endi) / -Stepi); + for (OMPIteratorExpr::IteratorDefinition &D : ID) { + // (Endi - Begini) + ExprResult Res = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Sub, + D.Range.End, D.Range.Begin); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + ExprResult St, St1; + if (D.Range.Step) { + St = D.Range.Step; + // (Endi - Begini) + Stepi + Res = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, Res.get(), + St.get()); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + // (Endi - Begini) + Stepi - 1 + Res = SemaRef.CreateBuiltinBinOp( + D.AssignmentLoc, BO_Sub, Res.get(), + SemaRef.ActOnIntegerConstant(D.AssignmentLoc, 1).get()); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + // ((Endi - Begini) + Stepi - 1) / Stepi + Res = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Div, Res.get(), + St.get()); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + St1 = SemaRef.CreateBuiltinUnaryOp(D.AssignmentLoc, UO_Minus, + D.Range.Step); + // (Begini - Endi) + ExprResult Res1 = SemaRef.CreateBuiltinBinOp( + D.AssignmentLoc, BO_Sub, D.Range.Begin, D.Range.End); + if (!Res1.isUsable()) { + IsCorrect = false; + continue; + } + // (Begini - Endi) - Stepi + Res1 = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, Res1.get(), + St1.get()); + if (!Res1.isUsable()) { + IsCorrect = false; + continue; + } + // (Begini - Endi) - Stepi - 1 + Res1 = SemaRef.CreateBuiltinBinOp( + D.AssignmentLoc, BO_Sub, Res1.get(), + SemaRef.ActOnIntegerConstant(D.AssignmentLoc, 1).get()); + if (!Res1.isUsable()) { + IsCorrect = false; + continue; + } + // ((Begini - Endi) - Stepi - 1) / (-Stepi) + Res1 = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Div, Res1.get(), + St1.get()); + if (!Res1.isUsable()) { + IsCorrect = false; + continue; + } + // Stepi > 0. + ExprResult CmpRes = SemaRef.CreateBuiltinBinOp( + D.AssignmentLoc, BO_GT, D.Range.Step, + SemaRef.ActOnIntegerConstant(D.AssignmentLoc, 0).get()); + if (!CmpRes.isUsable()) { + IsCorrect = false; + continue; + } + Res = SemaRef.ActOnConditionalOp(D.AssignmentLoc, D.AssignmentLoc, + CmpRes.get(), Res.get(), Res1.get()); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + } + Res = SemaRef.ActOnFinishFullExpr(Res.get(), /*DiscardedValue=*/false); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + + // Build counter update. + // Build counter. + auto *CounterVD = VarDecl::Create(Context, SemaRef.CurContext, + D.IteratorDecl->getBeginLoc(), + D.IteratorDecl->getBeginLoc(), nullptr, + Res.get()->getType(), nullptr, SC_None); + CounterVD->setImplicit(); + ExprResult RefRes = + SemaRef.BuildDeclRefExpr(CounterVD, CounterVD->getType(), VK_LValue, + D.IteratorDecl->getBeginLoc()); + // Build counter update. 
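The expression trees built above encode a plain trip-count computation; as ordinary arithmetic (a hypothetical helper, not patch code) the formula and its intent are:

```cpp
#include <cassert>

// Sketch of the trip count the patch materializes per iterator range:
//   N = (Step > 0) ? (End + Step - 1 - Begin) / Step
//                  : (Begin - Step - 1 - End) / -Step
// i.e. how many values i = Begin, Begin + Step, ... lie before reaching End.
long long tripCount(long long Begin, long long End, long long Step) {
  return Step > 0 ? (End + Step - 1 - Begin) / Step
                  : (Begin - Step - 1 - End) / -Step;
}

int main() {
  assert(tripCount(0, 10, 3) == 4);  // 0, 3, 6, 9
  assert(tripCount(10, 0, -3) == 4); // 10, 7, 4, 1
  assert(tripCount(5, 5, 1) == 0);   // empty range
}
```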
+ // I = Begini + counter * Stepi; + ExprResult UpdateRes; + if (D.Range.Step) { + UpdateRes = SemaRef.CreateBuiltinBinOp( + D.AssignmentLoc, BO_Mul, + SemaRef.DefaultLvalueConversion(RefRes.get()).get(), St.get()); + } else { + UpdateRes = SemaRef.DefaultLvalueConversion(RefRes.get()); + } + if (!UpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + UpdateRes = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, + D.Range.Begin, UpdateRes.get()); + if (!UpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + ExprResult VDRes = + SemaRef.BuildDeclRefExpr(cast<ValueDecl>(D.IteratorDecl), + cast<ValueDecl>(D.IteratorDecl)->getType(), + VK_LValue, D.IteratorDecl->getBeginLoc()); + UpdateRes = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Assign, + VDRes.get(), UpdateRes.get()); + if (!UpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + UpdateRes = + SemaRef.ActOnFinishFullExpr(UpdateRes.get(), /*DiscardedValue=*/true); + if (!UpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + ExprResult CounterUpdateRes = SemaRef.CreateBuiltinUnaryOp( + D.AssignmentLoc, UO_PreInc, RefRes.get()); + if (!CounterUpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + CounterUpdateRes = SemaRef.ActOnFinishFullExpr(CounterUpdateRes.get(), + /*DiscardedValue=*/true); + if (!CounterUpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + OMPIteratorHelperData &HD = Helpers.emplace_back(); + HD.CounterVD = CounterVD; + HD.Upper = Res.get(); + HD.Update = UpdateRes.get(); + HD.CounterUpdate = CounterUpdateRes.get(); + } + } else { + Helpers.assign(ID.size(), {}); + } + if (!IsCorrect) { + // Invalidate all created iterator declarations if error is found. + for (const OMPIteratorExpr::IteratorDefinition &D : ID) { + if (Decl *ID = D.IteratorDecl) + ID->setInvalidDecl(); + } + return ExprError(); + } + return OMPIteratorExpr::Create(Context, Context.OMPIteratorTy, IteratorKwLoc, + LLoc, RLoc, ID, Helpers); +} + +SemaOpenMP::SemaOpenMP(Sema &S) + : SemaBase(S), VarDataSharingAttributesStack(nullptr) {} diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index d28c24cfdfd33c..a7b33f0db047eb 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -35,6 +35,7 @@ #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -3097,7 +3098,7 @@ StmtResult Sema::BuildCXXForRangeStmt( // In OpenMP loop region loop control variable must be private. Perform // analysis of first part (if any).
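The new SemaOpenMP constructor above, together with the call-site change that follows (OpenMP().ActOnOpenMPLoopInitialization), shows the pattern this whole series applies: OpenMP entry points move off Sema into a language-area part reached through an OpenMP() accessor. A reduced sketch with illustrative stand-in names; the real classes are clang::Sema, clang::SemaBase, and clang::SemaOpenMP:

```cpp
// Minimal sketch of the Sema split, using hypothetical stand-in types.
struct Sema; // forward declaration

struct SemaBase {
  Sema &S; // every language-area part keeps a reference back to Sema
  explicit SemaBase(Sema &S) : S(S) {}
};

struct SemaOpenMPPart : SemaBase {
  explicit SemaOpenMPPart(Sema &S) : SemaBase(S) {}
  void ActOnOpenMPLoopInitialization() {} // moved off Sema by the patch
};

struct Sema {
  SemaOpenMPPart OpenMPImpl;
  Sema() : OpenMPImpl(*this) {}
  SemaOpenMPPart &OpenMP() { return OpenMPImpl; } // accessor used at call sites
};

void callSite(Sema &S) {
  S.OpenMP().ActOnOpenMPLoopInitialization(); // was: S.ActOnOpenMP...()
}
```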
if (getLangOpts().OpenMP >= 50 && BeginDeclStmt.isUsable()) - ActOnOpenMPLoopInitialization(ForLoc, BeginDeclStmt.get()); + OpenMP().ActOnOpenMPLoopInitialization(ForLoc, BeginDeclStmt.get()); return new (Context) CXXForRangeStmt( InitStmt, RangeDS, cast_or_null<DeclStmt>(BeginDeclStmt.get()), @@ -4822,7 +4823,8 @@ buildCapturedStmtCaptureList(Sema &S, CapturedRegionScopeInfo *RSI, assert(Cap.isVariableCapture() && "unknown kind of capture"); if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) - S.setOpenMPCaptureKind(Field, Cap.getVariable(), RSI->OpenMPLevel); + S.OpenMP().setOpenMPCaptureKind(Field, Cap.getVariable(), + RSI->OpenMPLevel); Captures.push_back(CapturedStmt::Capture( Cap.getLocation(), diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index c45a8d1408fff3..6d359c5a9a024c 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -28,6 +28,7 @@ #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/Template.h" #include "clang/Sema/TemplateInstCallback.h" #include "llvm/Support/TimeProfiler.h" @@ -399,7 +400,7 @@ static void instantiateOMPDeclareSimdDeclAttr( ++SI; } LinModifiers.append(Attr.modifiers_begin(), Attr.modifiers_end()); - (void)S.ActOnOpenMPDeclareSimdDirective( + (void)S.OpenMP().ActOnOpenMPDeclareSimdDirective( S.ConvertDeclToDeclGroup(New), Attr.getBranchState(), Simdlen.get(), Uniforms, Aligneds, Alignments, Linears, LinModifiers, Steps, Attr.getRange()); @@ -476,9 +477,9 @@ static void instantiateOMPDeclareVariantAttr( // Check function/variant ref for `omp declare variant` but not for `omp // begin declare variant` (which use implicit attributes). std::optional<std::pair<FunctionDecl *, Expr *>> DeclVarData = - S.checkOpenMPDeclareVariantFunction(S.ConvertDeclToDeclGroup(New), E, TI, - Attr.appendArgs_size(), - Attr.getRange()); + S.OpenMP().checkOpenMPDeclareVariantFunction( + S.ConvertDeclToDeclGroup(New), E, TI, Attr.appendArgs_size(), + Attr.getRange()); if (!DeclVarData) return; @@ -539,7 +540,7 @@ static void instantiateOMPDeclareVariantAttr( AppendArgs.emplace_back(II.IsTarget, II.IsTargetSync); } - S.ActOnOpenMPDeclareVariantDirective( + S.OpenMP().ActOnOpenMPDeclareVariantDirective( FD, E, TI, NothingExprs, NeedDevicePtrExprs, AppendArgs, SourceLocation(), SourceLocation(), Attr.getRange()); } @@ -3587,7 +3588,7 @@ Decl *TemplateDeclInstantiator::VisitOMPThreadPrivateDecl( } OMPThreadPrivateDecl *TD = - SemaRef.CheckOMPThreadPrivateDecl(D->getLocation(), Vars); + SemaRef.OpenMP().CheckOMPThreadPrivateDecl(D->getLocation(), Vars); TD->setAccess(AS_public); Owner->addDecl(TD); @@ -3610,14 +3611,14 @@ Decl *TemplateDeclInstantiator::VisitOMPAllocateDecl(OMPAllocateDecl *D) { ExprResult NewE = SemaRef.SubstExpr(AC->getAllocator(), TemplateArgs); if (!NewE.isUsable()) continue; - IC = SemaRef.ActOnOpenMPAllocatorClause( + IC = SemaRef.OpenMP().ActOnOpenMPAllocatorClause( NewE.get(), AC->getBeginLoc(), AC->getLParenLoc(), AC->getEndLoc()); } else if (auto *AC = dyn_cast<OMPAlignClause>(C)) { ExprResult NewE = SemaRef.SubstExpr(AC->getAlignment(), TemplateArgs); if (!NewE.isUsable()) continue; - IC = SemaRef.ActOnOpenMPAlignClause(NewE.get(), AC->getBeginLoc(), - AC->getLParenLoc(), AC->getEndLoc()); + IC = SemaRef.OpenMP().ActOnOpenMPAlignClause( + NewE.get(), AC->getBeginLoc(), AC->getLParenLoc(), AC->getEndLoc()); // If align clause value ends up being invalid, this can end up null.
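instantiateOMPDeclareSimdDeclAttr and instantiateOMPDeclareVariantAttr above re-run the OpenMP actions on each instantiation because the clause expressions can depend on template parameters. A hypothetical user-level case (modeled on the pattern in clang's OpenMP tests; assumed syntax):

```cpp
// Hypothetical user code: simdlen depends on T, so instantiating axpy<float>
// must substitute the clause expression and redo the declare-simd action.
#pragma omp declare simd simdlen(64 / sizeof(T)) uniform(n)
template <class T> void axpy(T *x, T *y, T a, int n);

void use(float *x, float *y) {
  axpy(x, y, 1.0f, 128); // re-checks simdlen(64 / sizeof(float)) == 16
}
```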
if (!IC) continue; @@ -3625,7 +3626,7 @@ Decl *TemplateDeclInstantiator::VisitOMPAllocateDecl(OMPAllocateDecl *D) { Clauses.push_back(IC); } - Sema::DeclGroupPtrTy Res = SemaRef.ActOnOpenMPAllocateDirective( + Sema::DeclGroupPtrTy Res = SemaRef.OpenMP().ActOnOpenMPAllocateDirective( D->getLocation(), Vars, Clauses, Owner); if (Res.get().isNull()) return nullptr; @@ -3646,7 +3647,7 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( D->getType()->containsUnexpandedParameterPack(); QualType SubstReductionType; if (RequiresInstantiation) { - SubstReductionType = SemaRef.ActOnOpenMPDeclareReductionType( + SubstReductionType = SemaRef.OpenMP().ActOnOpenMPDeclareReductionType( D->getLocation(), ParsedType::make(SemaRef.SubstType( D->getType(), TemplateArgs, D->getLocation(), DeclarationName()))); @@ -3667,7 +3668,7 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( SemaRef.CurrentInstantiationScope->findInstantiationOf(PrevDeclInScope) ->get()); } - auto DRD = SemaRef.ActOnOpenMPDeclareReductionDirectiveStart( + auto DRD = SemaRef.OpenMP().ActOnOpenMPDeclareReductionDirectiveStart( /*S=*/nullptr, Owner, D->getDeclName(), ReductionTypes, D->getAccess(), PrevDeclInScope); auto *NewDRD = cast(DRD.get().getSingleDecl()); @@ -3676,7 +3677,7 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( Expr *SubstInitializer = nullptr; // Combiners instantiation sequence. if (Combiner) { - SemaRef.ActOnOpenMPDeclareReductionCombinerStart( + SemaRef.OpenMP().ActOnOpenMPDeclareReductionCombinerStart( /*S=*/nullptr, NewDRD); SemaRef.CurrentInstantiationScope->InstantiatedLocal( cast(D->getCombinerIn())->getDecl(), @@ -3688,12 +3689,14 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( Sema::CXXThisScopeRAII ThisScope(SemaRef, ThisContext, Qualifiers(), ThisContext); SubstCombiner = SemaRef.SubstExpr(Combiner, TemplateArgs).get(); - SemaRef.ActOnOpenMPDeclareReductionCombinerEnd(NewDRD, SubstCombiner); + SemaRef.OpenMP().ActOnOpenMPDeclareReductionCombinerEnd(NewDRD, + SubstCombiner); } // Initializers instantiation sequence. 
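The combiner/initializer substitution above corresponds to declare-reduction directives declared inside templates, as in this hypothetical sketch:

```cpp
// Hypothetical user code: the combiner and initializer mention the dependent
// type T, so VisitOMPDeclareReductionDecl rebuilds them per instantiation.
template <class T> struct Reducer {
#pragma omp declare reduction(plus : T : omp_out = omp_out + omp_in)          \
    initializer(omp_priv = T())
};

Reducer<int> instantiate() { return {}; } // re-runs the directive for T = int
```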
if (Init) { - VarDecl *OmpPrivParm = SemaRef.ActOnOpenMPDeclareReductionInitializerStart( - /*S=*/nullptr, NewDRD); + VarDecl *OmpPrivParm = + SemaRef.OpenMP().ActOnOpenMPDeclareReductionInitializerStart( + /*S=*/nullptr, NewDRD); SemaRef.CurrentInstantiationScope->InstantiatedLocal( cast(D->getInitOrig())->getDecl(), cast(NewDRD->getInitOrig())->getDecl()); @@ -3710,8 +3713,8 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( SemaRef.InstantiateVariableInitializer(OmpPrivParm, OldPrivParm, TemplateArgs); } - SemaRef.ActOnOpenMPDeclareReductionInitializerEnd(NewDRD, SubstInitializer, - OmpPrivParm); + SemaRef.OpenMP().ActOnOpenMPDeclareReductionInitializerEnd( + NewDRD, SubstInitializer, OmpPrivParm); } IsCorrect = IsCorrect && SubstCombiner && (!Init || @@ -3720,7 +3723,7 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( (D->getInitializerKind() != OMPDeclareReductionInitKind::Call && !SubstInitializer)); - (void)SemaRef.ActOnOpenMPDeclareReductionDirectiveEnd( + (void)SemaRef.OpenMP().ActOnOpenMPDeclareReductionDirectiveEnd( /*S=*/nullptr, DRD, IsCorrect && !D->isInvalidDecl()); return NewDRD; @@ -3736,7 +3739,7 @@ TemplateDeclInstantiator::VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D) { QualType SubstMapperTy; DeclarationName VN = D->getVarName(); if (RequiresInstantiation) { - SubstMapperTy = SemaRef.ActOnOpenMPDeclareMapperType( + SubstMapperTy = SemaRef.OpenMP().ActOnOpenMPDeclareMapperType( D->getLocation(), ParsedType::make(SemaRef.SubstType(D->getType(), TemplateArgs, D->getLocation(), VN))); @@ -3756,11 +3759,12 @@ TemplateDeclInstantiator::VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D) { SmallVector Clauses; // Instantiate the mapper variable. DeclarationNameInfo DirName; - SemaRef.StartOpenMPDSABlock(llvm::omp::OMPD_declare_mapper, DirName, - /*S=*/nullptr, - (*D->clauselist_begin())->getBeginLoc()); - ExprResult MapperVarRef = SemaRef.ActOnOpenMPDeclareMapperDirectiveVarDecl( - /*S=*/nullptr, SubstMapperTy, D->getLocation(), VN); + SemaRef.OpenMP().StartOpenMPDSABlock(llvm::omp::OMPD_declare_mapper, DirName, + /*S=*/nullptr, + (*D->clauselist_begin())->getBeginLoc()); + ExprResult MapperVarRef = + SemaRef.OpenMP().ActOnOpenMPDeclareMapperDirectiveVarDecl( + /*S=*/nullptr, SubstMapperTy, D->getLocation(), VN); SemaRef.CurrentInstantiationScope->InstantiatedLocal( cast(D->getMapperVarRef())->getDecl(), cast(MapperVarRef.get())->getDecl()); @@ -3790,17 +3794,17 @@ TemplateDeclInstantiator::VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D) { SemaRef.SubstDeclarationNameInfo(OldC->getMapperIdInfo(), TemplateArgs); OMPVarListLocTy Locs(OldC->getBeginLoc(), OldC->getLParenLoc(), OldC->getEndLoc()); - OMPClause *NewC = SemaRef.ActOnOpenMPMapClause( + OMPClause *NewC = SemaRef.OpenMP().ActOnOpenMPMapClause( OldC->getIteratorModifier(), OldC->getMapTypeModifiers(), OldC->getMapTypeModifiersLoc(), SS, NewNameInfo, OldC->getMapType(), OldC->isImplicitMapType(), OldC->getMapLoc(), OldC->getColonLoc(), NewVars, Locs); Clauses.push_back(NewC); } - SemaRef.EndOpenMPDSABlock(nullptr); + SemaRef.OpenMP().EndOpenMPDSABlock(nullptr); if (!IsCorrect) return nullptr; - Sema::DeclGroupPtrTy DG = SemaRef.ActOnOpenMPDeclareMapperDirective( + Sema::DeclGroupPtrTy DG = SemaRef.OpenMP().ActOnOpenMPDeclareMapperDirective( /*S=*/nullptr, Owner, D->getDeclName(), SubstMapperTy, D->getLocation(), VN, D->getAccess(), MapperVarRef.get(), Clauses, PrevDeclInScope); Decl *NewDMD = DG.get().getSingleDecl(); diff --git a/clang/lib/Sema/SemaType.cpp 
b/clang/lib/Sema/SemaType.cpp index 404c4e8e31b558..1b31df8d97fba2 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -35,6 +35,7 @@ #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/Template.h" #include "clang/Sema/TemplateInstCallback.h" #include "llvm/ADT/ArrayRef.h" @@ -2640,7 +2641,7 @@ QualType Sema::BuildArrayType(QualType T, ArraySizeModifier ASM, } else if (isSFINAEContext()) { VLADiag = diag::err_vla_in_sfinae; VLAIsError = true; - } else if (getLangOpts().OpenMP && isInOpenMPTaskUntiedContext()) { + } else if (getLangOpts().OpenMP && OpenMP().isInOpenMPTaskUntiedContext()) { VLADiag = diag::err_openmp_vla_in_task_untied; VLAIsError = true; } else if (getLangOpts().CPlusPlus) { diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 9d15f3eacbb0f4..acc2d7ff9d427c 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -40,6 +40,7 @@ #include "clang/Sema/SemaDiagnostic.h" #include "clang/Sema/SemaInternal.h" #include "clang/Sema/SemaOpenACC.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/SemaSYCL.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/ErrorHandling.h" @@ -1656,7 +1657,7 @@ class TreeTransform { /// Ensures that the outermost loop in @p LoopStmt is wrapped by a /// OMPCanonicalLoop. StmtResult RebuildOMPCanonicalLoop(Stmt *LoopStmt) { - return getSema().ActOnOpenMPCanonicalLoop(LoopStmt); + return getSema().OpenMP().ActOnOpenMPCanonicalLoop(LoopStmt); } /// Build a new OpenMP executable directive. @@ -1669,7 +1670,7 @@ class TreeTransform { Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, OpenMPDirectiveKind PrevMappedDirective = OMPD_unknown) { - return getSema().ActOnOpenMPExecutableDirective( + return getSema().OpenMP().ActOnOpenMPExecutableDirective( Kind, DirName, CancelRegion, Clauses, AStmt, StartLoc, EndLoc, PrevMappedDirective); } @@ -1684,9 +1685,9 @@ class TreeTransform { SourceLocation NameModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPIfClause(NameModifier, Condition, StartLoc, - LParenLoc, NameModifierLoc, ColonLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPIfClause( + NameModifier, Condition, StartLoc, LParenLoc, NameModifierLoc, ColonLoc, + EndLoc); } /// Build a new OpenMP 'final' clause. @@ -1696,8 +1697,8 @@ class TreeTransform { OMPClause *RebuildOMPFinalClause(Expr *Condition, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPFinalClause(Condition, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPFinalClause(Condition, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'num_threads' clause. @@ -1708,8 +1709,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNumThreadsClause(NumThreads, StartLoc, - LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPNumThreadsClause(NumThreads, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'safelen' clause. 
@@ -1719,7 +1720,8 @@ class TreeTransform { OMPClause *RebuildOMPSafelenClause(Expr *Len, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPSafelenClause(Len, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPSafelenClause(Len, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'simdlen' clause. @@ -1729,28 +1731,30 @@ class TreeTransform { OMPClause *RebuildOMPSimdlenClause(Expr *Len, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPSimdlenClause(Len, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPSimdlenClause(Len, StartLoc, LParenLoc, + EndLoc); } OMPClause *RebuildOMPSizesClause(ArrayRef Sizes, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPSizesClause(Sizes, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPSizesClause(Sizes, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'full' clause. OMPClause *RebuildOMPFullClause(SourceLocation StartLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPFullClause(StartLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPFullClause(StartLoc, EndLoc); } /// Build a new OpenMP 'partial' clause. OMPClause *RebuildOMPPartialClause(Expr *Factor, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPPartialClause(Factor, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPPartialClause(Factor, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'allocator' clause. @@ -1760,7 +1764,8 @@ class TreeTransform { OMPClause *RebuildOMPAllocatorClause(Expr *A, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPAllocatorClause(A, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPAllocatorClause(A, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'collapse' clause. @@ -1770,8 +1775,8 @@ class TreeTransform { OMPClause *RebuildOMPCollapseClause(Expr *Num, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPCollapseClause(Num, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPCollapseClause(Num, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'default' clause. @@ -1782,8 +1787,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDefaultClause(Kind, KindKwLoc, - StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDefaultClause( + Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc); } /// Build a new OpenMP 'proc_bind' clause. @@ -1795,8 +1800,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPProcBindClause(Kind, KindKwLoc, - StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPProcBindClause( + Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc); } /// Build a new OpenMP 'schedule' clause. 
@@ -1808,7 +1813,7 @@ class TreeTransform { OpenMPScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc, SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPScheduleClause( + return getSema().OpenMP().ActOnOpenMPScheduleClause( M1, M2, Kind, ChunkSize, StartLoc, LParenLoc, M1Loc, M2Loc, KindLoc, CommaLoc, EndLoc); } @@ -1820,7 +1825,8 @@ class TreeTransform { OMPClause *RebuildOMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc, SourceLocation LParenLoc, Expr *Num) { - return getSema().ActOnOpenMPOrderedClause(StartLoc, EndLoc, LParenLoc, Num); + return getSema().OpenMP().ActOnOpenMPOrderedClause(StartLoc, EndLoc, + LParenLoc, Num); } /// Build a new OpenMP 'private' clause. @@ -1831,8 +1837,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPPrivateClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPPrivateClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'firstprivate' clause. @@ -1843,8 +1849,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPFirstprivateClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPFirstprivateClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'lastprivate' clause. @@ -1858,7 +1864,7 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPLastprivateClause( + return getSema().OpenMP().ActOnOpenMPLastprivateClause( VarList, LPKind, LPKindLoc, ColonLoc, StartLoc, LParenLoc, EndLoc); } @@ -1870,8 +1876,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPSharedClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPSharedClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'reduction' clause. 
@@ -1885,7 +1891,7 @@ class TreeTransform { SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef UnresolvedReductions) { - return getSema().ActOnOpenMPReductionClause( + return getSema().OpenMP().ActOnOpenMPReductionClause( VarList, Modifier, StartLoc, LParenLoc, ModifierLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions); } @@ -1900,7 +1906,7 @@ class TreeTransform { CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef UnresolvedReductions) { - return getSema().ActOnOpenMPTaskReductionClause( + return getSema().OpenMP().ActOnOpenMPTaskReductionClause( VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions); } @@ -1916,7 +1922,7 @@ class TreeTransform { CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef UnresolvedReductions) { - return getSema().ActOnOpenMPInReductionClause( + return getSema().OpenMP().ActOnOpenMPInReductionClause( VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions); } @@ -1930,9 +1936,9 @@ class TreeTransform { SourceLocation LParenLoc, OpenMPLinearClauseKind Modifier, SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation StepModifierLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPLinearClause(VarList, Step, StartLoc, LParenLoc, - Modifier, ModifierLoc, ColonLoc, - StepModifierLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPLinearClause( + VarList, Step, StartLoc, LParenLoc, Modifier, ModifierLoc, ColonLoc, + StepModifierLoc, EndLoc); } /// Build a new OpenMP 'aligned' clause. @@ -1944,8 +1950,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPAlignedClause(VarList, Alignment, StartLoc, - LParenLoc, ColonLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPAlignedClause( + VarList, Alignment, StartLoc, LParenLoc, ColonLoc, EndLoc); } /// Build a new OpenMP 'copyin' clause. @@ -1956,8 +1962,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPCopyinClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPCopyinClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'copyprivate' clause. @@ -1968,8 +1974,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPCopyprivateClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPCopyprivateClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'flush' pseudo clause. @@ -1980,8 +1986,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPFlushClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPFlushClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'depobj' pseudo clause. @@ -1991,8 +1997,8 @@ class TreeTransform { OMPClause *RebuildOMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDepobjClause(Depobj, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPDepobjClause(Depobj, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'depend' pseudo clause. 
@@ -2004,8 +2010,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDependClause(Data, DepModifier, VarList, - StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDependClause( + Data, DepModifier, VarList, StartLoc, LParenLoc, EndLoc); } /// Build a new OpenMP 'device' clause. @@ -2017,8 +2023,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDeviceClause(Modifier, Device, StartLoc, - LParenLoc, ModifierLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDeviceClause( + Modifier, Device, StartLoc, LParenLoc, ModifierLoc, EndLoc); } /// Build a new OpenMP 'map' clause. @@ -2032,7 +2038,7 @@ class TreeTransform { OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { - return getSema().ActOnOpenMPMapClause( + return getSema().OpenMP().ActOnOpenMPMapClause( IteratorModifier, MapTypeModifiers, MapTypeModifiersLoc, MapperIdScopeSpec, MapperId, MapType, IsMapTypeImplicit, MapLoc, ColonLoc, VarList, Locs, @@ -2048,8 +2054,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPAllocateClause(Allocate, VarList, StartLoc, - LParenLoc, ColonLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPAllocateClause( + Allocate, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc); } /// Build a new OpenMP 'num_teams' clause. @@ -2059,8 +2065,8 @@ class TreeTransform { OMPClause *RebuildOMPNumTeamsClause(Expr *NumTeams, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNumTeamsClause(NumTeams, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPNumTeamsClause(NumTeams, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'thread_limit' clause. @@ -2071,8 +2077,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPThreadLimitClause(ThreadLimit, StartLoc, - LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPThreadLimitClause( + ThreadLimit, StartLoc, LParenLoc, EndLoc); } /// Build a new OpenMP 'priority' clause. @@ -2082,8 +2088,8 @@ class TreeTransform { OMPClause *RebuildOMPPriorityClause(Expr *Priority, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPPriorityClause(Priority, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPPriorityClause(Priority, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'grainsize' clause. @@ -2095,8 +2101,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPGrainsizeClause(Modifier, Device, StartLoc, - LParenLoc, ModifierLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPGrainsizeClause( + Modifier, Device, StartLoc, LParenLoc, ModifierLoc, EndLoc); } /// Build a new OpenMP 'num_tasks' clause. 
@@ -2108,8 +2114,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNumTasksClause(Modifier, NumTasks, StartLoc, - LParenLoc, ModifierLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPNumTasksClause( + Modifier, NumTasks, StartLoc, LParenLoc, ModifierLoc, EndLoc); } /// Build a new OpenMP 'hint' clause. @@ -2119,7 +2125,8 @@ class TreeTransform { OMPClause *RebuildOMPHintClause(Expr *Hint, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPHintClause(Hint, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPHintClause(Hint, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'detach' clause. @@ -2129,7 +2136,8 @@ class TreeTransform { OMPClause *RebuildOMPDetachClause(Expr *Evt, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDetachClause(Evt, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDetachClause(Evt, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'dist_schedule' clause. @@ -2141,7 +2149,7 @@ class TreeTransform { Expr *ChunkSize, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDistScheduleClause( + return getSema().OpenMP().ActOnOpenMPDistScheduleClause( Kind, ChunkSize, StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc); } @@ -2156,9 +2164,9 @@ class TreeTransform { DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { - return getSema().ActOnOpenMPToClause(MotionModifiers, MotionModifiersLoc, - MapperIdScopeSpec, MapperId, ColonLoc, - VarList, Locs, UnresolvedMappers); + return getSema().OpenMP().ActOnOpenMPToClause( + MotionModifiers, MotionModifiersLoc, MapperIdScopeSpec, MapperId, + ColonLoc, VarList, Locs, UnresolvedMappers); } /// Build a new OpenMP 'from' clause. @@ -2172,7 +2180,7 @@ class TreeTransform { DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { - return getSema().ActOnOpenMPFromClause( + return getSema().OpenMP().ActOnOpenMPFromClause( MotionModifiers, MotionModifiersLoc, MapperIdScopeSpec, MapperId, ColonLoc, VarList, Locs, UnresolvedMappers); } @@ -2183,7 +2191,7 @@ class TreeTransform { /// Subclasses may override this routine to provide different behavior. OMPClause *RebuildOMPUseDevicePtrClause(ArrayRef VarList, const OMPVarListLocTy &Locs) { - return getSema().ActOnOpenMPUseDevicePtrClause(VarList, Locs); + return getSema().OpenMP().ActOnOpenMPUseDevicePtrClause(VarList, Locs); } /// Build a new OpenMP 'use_device_addr' clause. @@ -2192,7 +2200,7 @@ class TreeTransform { /// Subclasses may override this routine to provide different behavior. OMPClause *RebuildOMPUseDeviceAddrClause(ArrayRef VarList, const OMPVarListLocTy &Locs) { - return getSema().ActOnOpenMPUseDeviceAddrClause(VarList, Locs); + return getSema().OpenMP().ActOnOpenMPUseDeviceAddrClause(VarList, Locs); } /// Build a new OpenMP 'is_device_ptr' clause. @@ -2201,7 +2209,7 @@ class TreeTransform { /// Subclasses may override this routine to provide different behavior. 
OMPClause *RebuildOMPIsDevicePtrClause(ArrayRef<Expr *> VarList, const OMPVarListLocTy &Locs) { - return getSema().ActOnOpenMPIsDevicePtrClause(VarList, Locs); + return getSema().OpenMP().ActOnOpenMPIsDevicePtrClause(VarList, Locs); } /// Build a new OpenMP 'has_device_addr' clause. @@ -2210,7 +2218,7 @@ class TreeTransform { /// Subclasses may override this routine to provide different behavior. OMPClause *RebuildOMPHasDeviceAddrClause(ArrayRef<Expr *> VarList, const OMPVarListLocTy &Locs) { - return getSema().ActOnOpenMPHasDeviceAddrClause(VarList, Locs); + return getSema().OpenMP().ActOnOpenMPHasDeviceAddrClause(VarList, Locs); } /// Build a new OpenMP 'defaultmap' clause. @@ -2224,8 +2232,8 @@ class TreeTransform { SourceLocation MLoc, SourceLocation KindLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDefaultmapClause(M, Kind, StartLoc, LParenLoc, - MLoc, KindLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDefaultmapClause( + M, Kind, StartLoc, LParenLoc, MLoc, KindLoc, EndLoc); } /// Build a new OpenMP 'nontemporal' clause. @@ -2236,8 +2244,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNontemporalClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPNontemporalClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'inclusive' clause. @@ -2248,8 +2256,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPInclusiveClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPInclusiveClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'exclusive' clause. @@ -2260,8 +2268,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPExclusiveClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPExclusiveClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'uses_allocators' clause. @@ -2269,10 +2277,10 @@ class TreeTransform { /// By default, performs semantic analysis to build the new OpenMP clause. /// Subclasses may override this routine to provide different behavior. OMPClause *RebuildOMPUsesAllocatorsClause( - ArrayRef<Sema::UsesAllocatorsData> Data, SourceLocation StartLoc, + ArrayRef<SemaOpenMP::UsesAllocatorsData> Data, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPUsesAllocatorClause(StartLoc, LParenLoc, EndLoc, - Data); + return getSema().OpenMP().ActOnOpenMPUsesAllocatorClause( + StartLoc, LParenLoc, EndLoc, Data); } /// Build a new OpenMP 'affinity' clause. @@ -2284,8 +2292,8 @@ class TreeTransform { SourceLocation ColonLoc, SourceLocation EndLoc, Expr *Modifier, ArrayRef<Expr *> Locators) { - return getSema().ActOnOpenMPAffinityClause(StartLoc, LParenLoc, ColonLoc, - EndLoc, Modifier, Locators); + return getSema().OpenMP().ActOnOpenMPAffinityClause( + StartLoc, LParenLoc, ColonLoc, EndLoc, Modifier, Locators); } /// Build a new OpenMP 'order' clause.
@@ -2296,8 +2304,8 @@ class TreeTransform { OpenMPOrderClauseKind Kind, SourceLocation KindKwLoc, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc, OpenMPOrderClauseModifier Modifier, SourceLocation ModifierKwLoc) { - return getSema().ActOnOpenMPOrderClause(Modifier, Kind, StartLoc, LParenLoc, - ModifierKwLoc, KindKwLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPOrderClause( + Modifier, Kind, StartLoc, LParenLoc, ModifierKwLoc, KindKwLoc, EndLoc); } /// Build a new OpenMP 'init' clause. @@ -2309,8 +2317,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation VarLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPInitClause(InteropVar, InteropInfo, StartLoc, - LParenLoc, VarLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPInitClause( + InteropVar, InteropInfo, StartLoc, LParenLoc, VarLoc, EndLoc); } /// Build a new OpenMP 'use' clause. @@ -2320,8 +2328,8 @@ class TreeTransform { OMPClause *RebuildOMPUseClause(Expr *InteropVar, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation VarLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPUseClause(InteropVar, StartLoc, LParenLoc, - VarLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPUseClause(InteropVar, StartLoc, + LParenLoc, VarLoc, EndLoc); } /// Build a new OpenMP 'destroy' clause. @@ -2332,8 +2340,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation VarLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDestroyClause(InteropVar, StartLoc, LParenLoc, - VarLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDestroyClause( + InteropVar, StartLoc, LParenLoc, VarLoc, EndLoc); } /// Build a new OpenMP 'novariants' clause. @@ -2344,8 +2352,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNovariantsClause(Condition, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPNovariantsClause(Condition, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'nocontext' clause. @@ -2355,8 +2363,8 @@ class TreeTransform { OMPClause *RebuildOMPNocontextClause(Expr *Condition, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNocontextClause(Condition, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPNocontextClause(Condition, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'filter' clause. @@ -2366,8 +2374,8 @@ class TreeTransform { OMPClause *RebuildOMPFilterClause(Expr *ThreadID, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPFilterClause(ThreadID, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPFilterClause(ThreadID, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'bind' clause. @@ -2379,8 +2387,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPBindClause(Kind, KindLoc, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPBindClause(Kind, KindLoc, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'ompx_dyn_cgroup_mem' clause. 
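All of these Rebuild* hooks exist so that template instantiation funnels back through the same clause actions; e.g. in this hypothetical user code, instantiating f<4> rebuilds the if and num_threads clauses through getSema().OpenMP().ActOnOpenMP*Clause with N substituted:

```cpp
// Hypothetical user code exercising the clause-rebuild path in TreeTransform.
template <int N> void f(bool cond) {
#pragma omp parallel if(cond) num_threads(N)
  { /* body is re-checked for each instantiation */ }
}

void use() { f<4>(true); } // rebuilds if(cond) and num_threads(4)
```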
@@ -2390,8 +2398,8 @@ class TreeTransform { OMPClause *RebuildOMPXDynCGroupMemClause(Expr *Size, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPXDynCGroupMemClause(Size, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPXDynCGroupMemClause(Size, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'ompx_attribute' clause. @@ -2402,8 +2410,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPXAttributeClause(Attrs, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPXAttributeClause(Attrs, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'ompx_bare' clause. @@ -2412,7 +2420,7 @@ class TreeTransform { /// Subclasses may override this routine to provide different behavior. OMPClause *RebuildOMPXBareClause(SourceLocation StartLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPXBareClause(StartLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPXBareClause(StartLoc, EndLoc); } /// Build a new OpenMP 'align' clause. @@ -2422,7 +2430,8 @@ class TreeTransform { OMPClause *RebuildOMPAlignClause(Expr *A, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPAlignClause(A, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPAlignClause(A, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'at' clause. @@ -2433,8 +2442,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPAtClause(Kind, KwLoc, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPAtClause(Kind, KwLoc, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'severity' clause. @@ -2446,8 +2455,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPSeverityClause(Kind, KwLoc, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPSeverityClause(Kind, KwLoc, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'message' clause. @@ -2457,7 +2466,8 @@ class TreeTransform { OMPClause *RebuildOMPMessageClause(Expr *MS, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPMessageClause(MS, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPMessageClause(MS, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'doacross' clause. @@ -2469,7 +2479,7 @@ class TreeTransform { SourceLocation DepLoc, SourceLocation ColonLoc, ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDoacrossClause( + return getSema().OpenMP().ActOnOpenMPDoacrossClause( DepType, DepLoc, ColonLoc, VarList, StartLoc, LParenLoc, EndLoc); } @@ -2777,9 +2787,9 @@ class TreeTransform { SourceLocation ColonLocSecond, Expr *Length, Expr *Stride, SourceLocation RBracketLoc) { - return getSema().ActOnOMPArraySectionExpr(Base, LBracketLoc, LowerBound, - ColonLocFirst, ColonLocSecond, - Length, Stride, RBracketLoc); + return getSema().OpenMP().ActOnOMPArraySectionExpr( + Base, LBracketLoc, LowerBound, ColonLocFirst, ColonLocSecond, Length, + Stride, RBracketLoc); } /// Build a new array shaping expression. 
@@ -2790,19 +2800,20 @@ class TreeTransform { SourceLocation RParenLoc, ArrayRef Dims, ArrayRef BracketsRanges) { - return getSema().ActOnOMPArrayShapingExpr(Base, LParenLoc, RParenLoc, Dims, - BracketsRanges); + return getSema().OpenMP().ActOnOMPArrayShapingExpr( + Base, LParenLoc, RParenLoc, Dims, BracketsRanges); } /// Build a new iterator expression. /// /// By default, performs semantic analysis to build the new expression. /// Subclasses may override this routine to provide different behavior. - ExprResult RebuildOMPIteratorExpr( - SourceLocation IteratorKwLoc, SourceLocation LLoc, SourceLocation RLoc, - ArrayRef Data) { - return getSema().ActOnOMPIteratorExpr(/*Scope=*/nullptr, IteratorKwLoc, - LLoc, RLoc, Data); + ExprResult + RebuildOMPIteratorExpr(SourceLocation IteratorKwLoc, SourceLocation LLoc, + SourceLocation RLoc, + ArrayRef Data) { + return getSema().OpenMP().ActOnOMPIteratorExpr( + /*Scope=*/nullptr, IteratorKwLoc, LLoc, RLoc, Data); } /// Build a new call expression. @@ -8060,7 +8071,7 @@ template StmtResult TreeTransform::TransformForStmt(ForStmt *S) { if (getSema().getLangOpts().OpenMP) - getSema().startOpenMPLoop(); + getSema().OpenMP().startOpenMPLoop(); // Transform the initialization statement StmtResult Init = getDerived().TransformStmt(S->getInit()); @@ -8070,7 +8081,8 @@ TreeTransform::TransformForStmt(ForStmt *S) { // In OpenMP loop region loop control variable must be captured and be // private. Perform analysis of first part (if any). if (getSema().getLangOpts().OpenMP && Init.isUsable()) - getSema().ActOnOpenMPLoopInitialization(S->getForLoc(), Init.get()); + getSema().OpenMP().ActOnOpenMPLoopInitialization(S->getForLoc(), + Init.get()); // Transform the condition Sema::ConditionResult Cond = getDerived().TransformCondition( @@ -9029,9 +9041,9 @@ StmtResult TreeTransform::TransformOMPExecutableDirective( for (ArrayRef::iterator I = Clauses.begin(), E = Clauses.end(); I != E; ++I) { if (*I) { - getDerived().getSema().StartOpenMPClause((*I)->getClauseKind()); + getDerived().getSema().OpenMP().StartOpenMPClause((*I)->getClauseKind()); OMPClause *Clause = getDerived().TransformOMPClause(*I); - getDerived().getSema().EndOpenMPClause(); + getDerived().getSema().OpenMP().EndOpenMPClause(); if (Clause) TClauses.push_back(Clause); } else { @@ -9040,8 +9052,9 @@ StmtResult TreeTransform::TransformOMPExecutableDirective( } StmtResult AssociatedStmt; if (D->hasAssociatedStmt() && D->getAssociatedStmt()) { - getDerived().getSema().ActOnOpenMPRegionStart(D->getDirectiveKind(), - /*CurScope=*/nullptr); + getDerived().getSema().OpenMP().ActOnOpenMPRegionStart( + D->getDirectiveKind(), + /*CurScope=*/nullptr); StmtResult Body; { Sema::CompoundScopeRAII CompoundScope(getSema()); @@ -9059,7 +9072,7 @@ StmtResult TreeTransform::TransformOMPExecutableDirective( Body = getDerived().RebuildOMPCanonicalLoop(Body.get()); } AssociatedStmt = - getDerived().getSema().ActOnOpenMPRegionEnd(Body, TClauses); + getDerived().getSema().OpenMP().ActOnOpenMPRegionEnd(Body, TClauses); if (AssociatedStmt.isInvalid()) { return StmtError(); } @@ -9100,10 +9113,10 @@ template StmtResult TreeTransform::TransformOMPParallelDirective(OMPParallelDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - 
getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9111,10 +9124,10 @@ template StmtResult TreeTransform::TransformOMPSimdDirective(OMPSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_simd, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9122,10 +9135,10 @@ template StmtResult TreeTransform::TransformOMPTileDirective(OMPTileDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(D->getDirectiveKind(), DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9133,10 +9146,10 @@ template StmtResult TreeTransform::TransformOMPUnrollDirective(OMPUnrollDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(D->getDirectiveKind(), DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9144,10 +9157,10 @@ template StmtResult TreeTransform::TransformOMPForDirective(OMPForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_for, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_for, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9155,10 +9168,10 @@ template StmtResult TreeTransform::TransformOMPForSimdDirective(OMPForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_for_simd, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_for_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9166,10 +9179,10 @@ template StmtResult TreeTransform::TransformOMPSectionsDirective(OMPSectionsDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_sections, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_sections, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9177,10 +9190,10 @@ template StmtResult TreeTransform::TransformOMPSectionDirective(OMPSectionDirective *D) { 
DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_section, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_section, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9188,10 +9201,10 @@ template StmtResult TreeTransform::TransformOMPScopeDirective(OMPScopeDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_scope, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_scope, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9199,10 +9212,10 @@ template StmtResult TreeTransform::TransformOMPSingleDirective(OMPSingleDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_single, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_single, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9210,20 +9223,20 @@ template StmtResult TreeTransform::TransformOMPMasterDirective(OMPMasterDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_master, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_master, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } template StmtResult TreeTransform::TransformOMPCriticalDirective(OMPCriticalDirective *D) { - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_critical, D->getDirectiveName(), nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9231,10 +9244,10 @@ template StmtResult TreeTransform::TransformOMPParallelForDirective( OMPParallelForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_for, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_for, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9242,10 +9255,10 @@ template StmtResult TreeTransform::TransformOMPParallelForSimdDirective( OMPParallelForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_for_simd, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_for_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - 
getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9253,10 +9266,10 @@ template StmtResult TreeTransform::TransformOMPParallelMasterDirective( OMPParallelMasterDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_master, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_master, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9264,10 +9277,10 @@ template StmtResult TreeTransform::TransformOMPParallelMaskedDirective( OMPParallelMaskedDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_masked, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_masked, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9275,10 +9288,10 @@ template StmtResult TreeTransform::TransformOMPParallelSectionsDirective( OMPParallelSectionsDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_sections, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_sections, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9286,10 +9299,10 @@ template StmtResult TreeTransform::TransformOMPTaskDirective(OMPTaskDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_task, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_task, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9297,10 +9310,10 @@ template StmtResult TreeTransform::TransformOMPTaskyieldDirective( OMPTaskyieldDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_taskyield, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_taskyield, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9308,10 +9321,10 @@ template StmtResult TreeTransform::TransformOMPBarrierDirective(OMPBarrierDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_barrier, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_barrier, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9319,10 +9332,10 @@ template 
StmtResult TreeTransform::TransformOMPTaskwaitDirective(OMPTaskwaitDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_taskwait, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_taskwait, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9330,10 +9343,10 @@ template StmtResult TreeTransform::TransformOMPErrorDirective(OMPErrorDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_error, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_error, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9341,10 +9354,10 @@ template StmtResult TreeTransform::TransformOMPTaskgroupDirective( OMPTaskgroupDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_taskgroup, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_taskgroup, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9352,10 +9365,10 @@ template StmtResult TreeTransform::TransformOMPFlushDirective(OMPFlushDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_flush, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_flush, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9363,10 +9376,10 @@ template StmtResult TreeTransform::TransformOMPDepobjDirective(OMPDepobjDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_depobj, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_depobj, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9374,10 +9387,10 @@ template StmtResult TreeTransform::TransformOMPScanDirective(OMPScanDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_scan, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_scan, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9385,10 +9398,10 @@ template StmtResult TreeTransform::TransformOMPOrderedDirective(OMPOrderedDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_ordered, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_ordered, 
DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9396,10 +9409,10 @@ template StmtResult TreeTransform::TransformOMPAtomicDirective(OMPAtomicDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_atomic, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_atomic, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9407,10 +9420,10 @@ template StmtResult TreeTransform::TransformOMPTargetDirective(OMPTargetDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9418,10 +9431,10 @@ template StmtResult TreeTransform::TransformOMPTargetDataDirective( OMPTargetDataDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_data, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_data, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9429,10 +9442,10 @@ template StmtResult TreeTransform::TransformOMPTargetEnterDataDirective( OMPTargetEnterDataDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_enter_data, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_enter_data, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9440,10 +9453,10 @@ template StmtResult TreeTransform::TransformOMPTargetExitDataDirective( OMPTargetExitDataDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_exit_data, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_exit_data, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9451,10 +9464,10 @@ template StmtResult TreeTransform::TransformOMPTargetParallelDirective( OMPTargetParallelDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_parallel, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_parallel, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); 
+ getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9462,10 +9475,10 @@ template StmtResult TreeTransform::TransformOMPTargetParallelForDirective( OMPTargetParallelForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_parallel_for, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_parallel_for, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9473,10 +9486,10 @@ template StmtResult TreeTransform::TransformOMPTargetUpdateDirective( OMPTargetUpdateDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_update, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_update, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9484,10 +9497,10 @@ template StmtResult TreeTransform::TransformOMPTeamsDirective(OMPTeamsDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_teams, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_teams, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9495,10 +9508,10 @@ template StmtResult TreeTransform::TransformOMPCancellationPointDirective( OMPCancellationPointDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_cancellation_point, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_cancellation_point, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9506,10 +9519,10 @@ template StmtResult TreeTransform::TransformOMPCancelDirective(OMPCancelDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_cancel, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_cancel, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9517,10 +9530,10 @@ template StmtResult TreeTransform::TransformOMPTaskLoopDirective(OMPTaskLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_taskloop, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_taskloop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9528,10 +9541,10 @@ template StmtResult 
TreeTransform::TransformOMPTaskLoopSimdDirective( OMPTaskLoopSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_taskloop_simd, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_taskloop_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9539,10 +9552,10 @@ template StmtResult TreeTransform::TransformOMPMasterTaskLoopDirective( OMPMasterTaskLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_master_taskloop, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_master_taskloop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9550,10 +9563,10 @@ template StmtResult TreeTransform::TransformOMPMaskedTaskLoopDirective( OMPMaskedTaskLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_masked_taskloop, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_masked_taskloop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9561,10 +9574,10 @@ template StmtResult TreeTransform::TransformOMPMasterTaskLoopSimdDirective( OMPMasterTaskLoopSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_master_taskloop_simd, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_master_taskloop_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9572,10 +9585,10 @@ template StmtResult TreeTransform::TransformOMPMaskedTaskLoopSimdDirective( OMPMaskedTaskLoopSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_masked_taskloop_simd, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_masked_taskloop_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9583,10 +9596,10 @@ template StmtResult TreeTransform::TransformOMPParallelMasterTaskLoopDirective( OMPParallelMasterTaskLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_parallel_master_taskloop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9594,10 +9607,10 @@ template StmtResult TreeTransform::TransformOMPParallelMaskedTaskLoopDirective( 
OMPParallelMaskedTaskLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_parallel_masked_taskloop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9606,10 +9619,10 @@ StmtResult TreeTransform::TransformOMPParallelMasterTaskLoopSimdDirective( OMPParallelMasterTaskLoopSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_parallel_master_taskloop_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9618,10 +9631,10 @@ StmtResult TreeTransform::TransformOMPParallelMaskedTaskLoopSimdDirective( OMPParallelMaskedTaskLoopSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_parallel_masked_taskloop_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9629,10 +9642,10 @@ template StmtResult TreeTransform::TransformOMPDistributeDirective( OMPDistributeDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_distribute, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_distribute, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9640,10 +9653,10 @@ template StmtResult TreeTransform::TransformOMPDistributeParallelForDirective( OMPDistributeParallelForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_distribute_parallel_for, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9652,10 +9665,10 @@ StmtResult TreeTransform::TransformOMPDistributeParallelForSimdDirective( OMPDistributeParallelForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_distribute_parallel_for_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9663,10 +9676,10 @@ template StmtResult TreeTransform::TransformOMPDistributeSimdDirective( OMPDistributeSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_distribute_simd, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_distribute_simd, DirName, nullptr, D->getBeginLoc()); 
StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9674,10 +9687,10 @@ template StmtResult TreeTransform::TransformOMPTargetParallelForSimdDirective( OMPTargetParallelForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_target_parallel_for_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9685,10 +9698,10 @@ template StmtResult TreeTransform::TransformOMPTargetSimdDirective( OMPTargetSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_simd, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9696,10 +9709,10 @@ template StmtResult TreeTransform::TransformOMPTeamsDistributeDirective( OMPTeamsDistributeDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_teams_distribute, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_teams_distribute, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9707,10 +9720,10 @@ template StmtResult TreeTransform::TransformOMPTeamsDistributeSimdDirective( OMPTeamsDistributeSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_teams_distribute_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9718,11 +9731,11 @@ template StmtResult TreeTransform::TransformOMPTeamsDistributeParallelForSimdDirective( OMPTeamsDistributeParallelForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_teams_distribute_parallel_for_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9730,10 +9743,10 @@ template StmtResult TreeTransform::TransformOMPTeamsDistributeParallelForDirective( OMPTeamsDistributeParallelForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_teams_distribute_parallel_for, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9741,10 
+9754,10 @@ template StmtResult TreeTransform::TransformOMPTargetTeamsDirective( OMPTargetTeamsDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_teams, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_teams, DirName, nullptr, D->getBeginLoc()); auto Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9752,10 +9765,10 @@ template StmtResult TreeTransform::TransformOMPTargetTeamsDistributeDirective( OMPTargetTeamsDistributeDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_target_teams_distribute, DirName, nullptr, D->getBeginLoc()); auto Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9764,11 +9777,11 @@ StmtResult TreeTransform::TransformOMPTargetTeamsDistributeParallelForDirective( OMPTargetTeamsDistributeParallelForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_target_teams_distribute_parallel_for, DirName, nullptr, D->getBeginLoc()); auto Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9777,11 +9790,11 @@ StmtResult TreeTransform:: TransformOMPTargetTeamsDistributeParallelForSimdDirective( OMPTargetTeamsDistributeParallelForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_target_teams_distribute_parallel_for_simd, DirName, nullptr, D->getBeginLoc()); auto Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9790,10 +9803,10 @@ StmtResult TreeTransform::TransformOMPTargetTeamsDistributeSimdDirective( OMPTargetTeamsDistributeSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_target_teams_distribute_simd, DirName, nullptr, D->getBeginLoc()); auto Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9801,10 +9814,10 @@ template StmtResult TreeTransform::TransformOMPInteropDirective(OMPInteropDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_interop, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_interop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9812,10 +9825,10 @@ template StmtResult TreeTransform::TransformOMPDispatchDirective(OMPDispatchDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_dispatch, DirName, nullptr, - D->getBeginLoc()); + 
getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_dispatch, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9823,10 +9836,10 @@ template StmtResult TreeTransform::TransformOMPMaskedDirective(OMPMaskedDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_masked, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_masked, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9834,10 +9847,10 @@ template StmtResult TreeTransform::TransformOMPGenericLoopDirective( OMPGenericLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_loop, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_loop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9845,10 +9858,10 @@ template StmtResult TreeTransform::TransformOMPTeamsGenericLoopDirective( OMPTeamsGenericLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_teams_loop, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_teams_loop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9856,10 +9869,10 @@ template StmtResult TreeTransform::TransformOMPTargetTeamsGenericLoopDirective( OMPTargetTeamsGenericLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_teams_loop, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_teams_loop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9867,10 +9880,10 @@ template StmtResult TreeTransform::TransformOMPParallelGenericLoopDirective( OMPParallelGenericLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_loop, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_loop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9879,10 +9892,10 @@ StmtResult TreeTransform::TransformOMPTargetParallelGenericLoopDirective( OMPTargetParallelGenericLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_parallel_loop, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_parallel_loop, DirName, nullptr, 
D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -10972,7 +10985,7 @@ TreeTransform<Derived>::TransformOMPExclusiveClause(OMPExclusiveClause *C) { template <typename Derived> OMPClause *TreeTransform<Derived>::TransformOMPUsesAllocatorsClause( OMPUsesAllocatorsClause *C) { - SmallVector<Sema::UsesAllocatorsData, 16> Data; + SmallVector<SemaOpenMP::UsesAllocatorsData, 16> Data; Data.reserve(C->getNumberOfAllocators()); for (unsigned I = 0, E = C->getNumberOfAllocators(); I < E; ++I) { OMPUsesAllocatorsClause::Data D = C->getAllocatorData(I); @@ -10985,7 +10998,7 @@ OMPClause *TreeTransform<Derived>::TransformOMPUsesAllocatorsClause( if (AllocatorTraits.isInvalid()) continue; } - Sema::UsesAllocatorsData &NewD = Data.emplace_back(); + SemaOpenMP::UsesAllocatorsData &NewD = Data.emplace_back(); NewD.Allocator = Allocator.get(); NewD.AllocatorTraits = AllocatorTraits.get(); NewD.LParenLoc = D.LParenLoc; @@ -11667,7 +11680,7 @@ template <typename Derived> ExprResult TreeTransform<Derived>::TransformOMPIteratorExpr(OMPIteratorExpr *E) { unsigned NumIterators = E->numOfIterators(); - SmallVector<Sema::OMPIteratorData, 4> Data(NumIterators); + SmallVector<SemaOpenMP::OMPIteratorData, 4> Data(NumIterators); bool ErrorFound = false; bool NeedToRebuild = getDerived().AlwaysRebuild(); @@ -11802,7 +11815,8 @@ TreeTransform<Derived>::TransformMemberExpr(MemberExpr *E) { // Skip for member expression of (this->f), rebuilt this->f is needed // for OpenMP where the field needs to be privatized in that case. if (!(isa<CXXThisExpr>(E->getBase()) && - getSema().isOpenMPRebuildMemberExpr(cast<FieldDecl>(Member)))) { + getSema().OpenMP().isOpenMPRebuildMemberExpr( + cast<FieldDecl>(Member)))) { // Mark it referenced in the new context regardless. // FIXME: this is a bit instantiation-specific. SemaRef.MarkMemberReferenced(E); From 9e7aab951ffba0211193ceb435c6b49e4e19ac24 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 16 Apr 2024 07:43:13 -0500 Subject: [PATCH 096/300] [CUDA] Rename SM_32 to SM_32_ to work around AIX headers (#88779) Summary: AIX headers define SM_32 as a macro, so we need to work around it. In the future this will be removed, but for now we should just rename it to avoid these issues. --- clang/include/clang/Basic/Cuda.h | 8 +++----- clang/lib/Basic/Cuda.cpp | 6 +++--- clang/lib/Basic/Targets/NVPTX.cpp | 2 +- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 +- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 38f30543a0f662..ba0e4465a0f5a0 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -50,17 +50,15 @@ const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" CudaVersion CudaStringToVersion(const llvm::Twine &S); -// We have a name conflict with sys/mac.h on AIX -#ifdef SM_32 -#undef SM_32 -#endif enum class CudaArch { UNUSED, UNKNOWN, + // TODO: Deprecate and remove GPU architectures older than sm_52. SM_20, SM_21, SM_30, - SM_32, + // This has a name conflict with sys/mac.h on AIX, rename it as a workaround.
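// [Editor's note, not part of the patch] A minimal sketch of why the rename
// is needed, assuming a hypothetical macro value; the real definition comes
// from the AIX sys/mac.h header:
//
//   #define SM_32 0x8                // hypothetical stand-in for sys/mac.h
//   enum class CudaArch { SM_32 };   // preprocesses to { 0x8 }: ill-formed
//   enum class CudaArch { SM_32_ };  // untouched; no macro named SM_32_
//
// Renaming the enumerator keeps the token out of the macro's reach, which is
// also why the earlier "#ifdef SM_32 / #undef SM_32" workaround can go away.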
+ SM_32_, SM_35, SM_37, SM_50, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 1b1da6a1356f2c..113483db5729b0 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -86,7 +86,7 @@ static const CudaArchToStringMap arch_names[] = { // clang-format off {CudaArch::UNUSED, "", ""}, SM2(20, "compute_20"), SM2(21, "compute_20"), // Fermi - SM(30), SM(32), SM(35), SM(37), // Kepler + SM(30), {CudaArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler SM(50), SM(52), SM(53), // Maxwell SM(60), SM(61), SM(62), // Pascal SM(70), SM(72), // Volta @@ -186,7 +186,7 @@ CudaVersion MinVersionForCudaArch(CudaArch A) { case CudaArch::SM_20: case CudaArch::SM_21: case CudaArch::SM_30: - case CudaArch::SM_32: + case CudaArch::SM_32_: case CudaArch::SM_35: case CudaArch::SM_37: case CudaArch::SM_50: @@ -231,7 +231,7 @@ CudaVersion MaxVersionForCudaArch(CudaArch A) { case CudaArch::SM_21: return CudaVersion::CUDA_80; case CudaArch::SM_30: - case CudaArch::SM_32: + case CudaArch::SM_32_: return CudaVersion::CUDA_102; case CudaArch::SM_35: case CudaArch::SM_37: diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index b47c399fef6042..8ad9e6e5f58916 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -239,7 +239,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, return "210"; case CudaArch::SM_30: return "300"; - case CudaArch::SM_32: + case CudaArch::SM_32_: return "320"; case CudaArch::SM_35: return "350"; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 59ba03c6b86253..eb716520e5ff56 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -3466,7 +3466,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective( case CudaArch::SM_20: case CudaArch::SM_21: case CudaArch::SM_30: - case CudaArch::SM_32: + case CudaArch::SM_32_: case CudaArch::SM_35: case CudaArch::SM_37: case CudaArch::SM_50: From e7fb49c24e4be4780ee4df9829980c5e8ddd511e Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 16 Apr 2024 09:00:57 -0400 Subject: [PATCH 097/300] Switch release notes links to using markup for github issues; NFC --- clang/docs/ReleaseNotes.rst | 43 +++++++++++++------------------------ 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 255d2cc0440438..db90db6fa4ab0e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -104,8 +104,7 @@ C++20 Feature Support - Clang now implements [module.import]p7 fully. Clang now will import module units transitively for the module units coming from the same module of the - current module units. - Fixes `#84002 `_. + current module units. Fixes #GH84002 - Initial support for class template argument deduction (CTAD) for type alias templates (`P1814R0 `_). @@ -135,8 +134,7 @@ C++2c Feature Support Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Substitute template parameter pack, when it is not explicitly specified - in the template parameters, but is deduced from a previous argument. - (`#78449: `_). + in the template parameters, but is deduced from a previous argument. (#GH78449) - Type qualifications are now ignored when evaluating layout compatibility of two types. 
@@ -176,8 +174,7 @@ C23 Feature Support - Clang now generates predefined macros of the form ``__TYPE_FMTB__`` and ``__TYPE_FMTb__`` (e.g., ``__UINT_FAST64_FMTB__``) in C23 mode for use with - macros typically exposed from ``<inttypes.h>``, such as ``PRIb8``. - (`#81896: `_). + macros typically exposed from ``<inttypes.h>``, such as ``PRIb8``. (#GH81896) - Clang now supports `N3018 The constexpr specifier for object definitions` `_. @@ -215,7 +212,7 @@ New Compiler Flags - ``-Wmissing-designated-field-initializers``, grouped under ``-Wmissing-field-initializers``. This diagnostic can be disabled to make ``-Wmissing-field-initializers`` behave - like it did before Clang 18.x. Fixes (`#56628 `_) + like it did before Clang 18.x. Fixes #GH56628 Deprecated Compiler Flags ------------------------- @@ -254,8 +251,7 @@ Removed Compiler Flags - The ``-freroll-loops`` flag has been removed. It had no effect since Clang 13. - ``-m[no-]unaligned-access`` is removed for RISC-V and LoongArch. - ``-m[no-]strict-align``, also supported by GCC, should be used instead. - (`#85350 `_.) + ``-m[no-]strict-align``, also supported by GCC, should be used instead. (#GH85350) Attribute Changes in Clang -------------------------- @@ -325,8 +321,7 @@ Improvements to Clang's diagnostics Fixes #GH82512. - Clang now provides improved warnings for the ``cleanup`` attribute to detect misuse scenarios, - such as attempting to call ``free`` on an unallocated object. Fixes - `#79443 `_. + such as attempting to call ``free`` on an unallocated object. Fixes #GH79443. - Clang no longer warns when the ``bitand`` operator is used with boolean operands, distinguishing it from potential typographical errors or unintended @@ -372,11 +367,10 @@ Improvements to Clang's time-trace Bug Fixes in This Version ------------------------- - Clang's ``-Wundefined-func-template`` no longer warns on pure virtual - functions. - (`#74016 `_) + functions. (#GH74016) - Fixed missing warnings when comparing mismatched enumeration constants - in C (`#29217 `). + in C (#GH29217) - Clang now accepts elaborated-type-specifiers that explicitly specialize a member class template for an implicit instantiation of a class template. @@ -415,7 +409,7 @@ Bug Fixes in This Version type only rather than to the complex type (e.g. ``_Complex float / int`` is now evaluated as ``_Complex float / float`` rather than ``_Complex float / _Complex float``), as mandated by the C standard. This significantly improves codegen of `*` and `/` especially. - Fixes (`#31205 `_). + Fixes #GH31205. - Fixes an assertion failure on invalid code when trying to define member functions in lambdas. @@ -464,8 +458,7 @@ Bug Fixes to C++ Support - Fix a crash when trying to call a varargs function that also has an explicit object parameter. (#GH80971) - Fixed a bug where abbreviated function templates would append their invented template parameters to an empty template parameter list. - Fix parsing of abominable function types inside type traits. Fixes #GH77585 - Clang now classifies aggregate initialization in C++17 and newer as constant or non-constant more accurately. Previously, only a subset of the initializer elements were considered, misclassifying some initializers as constant.
Partially fixes @@ -506,9 +499,7 @@ Bug Fixes to C++ Support - Fix a bug where overload resolution falsely reported an ambiguity when it was comparing a member-function against a non member function or a member-function with an explicit object parameter against a member function with no explicit object parameter - when one of the functions had more specialized templates. - Fixes (`#82509 `_) - and (`#74494 `_) + when one of the functions had more specialized templates. Fixes #GH82509 and #GH74494 - Clang now supports direct lambda calls inside of type alias template declarations. This addresses (#GH70601), (#GH76674), (#GH79555), (#GH81145) and (#GH82104). - Allow access to a public template alias declaration that refers to friend's @@ -530,8 +521,7 @@ Bug Fixes to C++ Support - Fixed a bug that prevented member function templates of class templates declared with a deduced return type from being explicitly specialized for a given implicit instantiation of the class template. -- Fix crash when inheriting from a cv-qualified type. Fixes: - (`#35603 `_) +- Fix crash when inheriting from a cv-qualified type. Fixes #GH35603 - Fix a crash when the using enum declaration uses an anonymous enumeration. Fixes (#GH86790). - Handled an edge case in ``getFullyPackExpandedSize`` so that we now avoid a false-positive diagnostic. (#GH84220) - Clang now correctly tracks type dependence of by-value captures in lambdas with an explicit @@ -539,8 +529,7 @@ Bug Fixes to C++ Support Fixes (#GH70604), (#GH79754), (#GH84163), (#GH84425), (#GH86054), (#GH86398), and (#GH86399). - Fix a crash when deducing ``auto`` from an invalid dereference (#GH88329). - Fix a crash in requires expression with templated base class member function. Fixes (#GH84020). -- placement new initializes typedef array with correct size - (`#GH41441 `_) +- Placement new initializes typedef array with correct size (#GH41441) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -554,8 +543,7 @@ Miscellaneous Clang Crashes Fixed ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Do not attempt to dump the layout of dependent types or invalid declarations - when ``-fdump-record-layouts-complete`` is passed. - Fixes (`#83684 `_). + when ``-fdump-record-layouts-complete`` is passed. Fixes #GH83684. OpenACC Specific Changes ------------------------ @@ -605,8 +593,7 @@ Windows Support would only be included if AVX was enabled at compile time. This was done to work around include times from MSVC STL including ``intrin.h`` under clang-cl. Clang-cl now provides ``intrin0.h`` for MSVC STL and therefore all intrinsic - features without requiring enablement at compile time. - Fixes: (`#53520 `_) + features without requiring enablement at compile time. Fixes #GH53520 - Improved compile times with MSVC STL. MSVC provides ``intrin0.h`` which is a header that only includes intrinsics that are used by MSVC STL to avoid the From 9d8be2408768912dc113a342050049231e4fc8d1 Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Tue, 16 Apr 2024 15:30:32 +0200 Subject: [PATCH 098/300] Revert "[codegen] Emit missing cleanups for stmt-expr and coro suspensions" and related commits (#88884) The original change caused widespread breakage in msan/ubsan tests and caused `use-after-free` errors. Most likely we are adding more cleanups than necessary.
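[Editor's note] For context, a minimal sketch of the kind of control flow the reverted change targeted; this is a reconstruction under stated assumptions, not code taken from the reverted patch or its deleted tests. A branch out of a GNU statement expression still has to run the cleanups for objects constructed inside it, and the revert message suggests the fix registered more cleanups than such branches actually need:

  // Requires Clang or GCC; statement expressions are a GNU extension.
  struct S {
    ~S() {} // must run on every path that leaves the statement expression
  };

  int f(bool cond) {
    int v = ({
      S s;
      if (cond)
        return 0; // early exit from the statement expression; ~S() must run
      1;          // otherwise this is the value of the statement expression
    });
    return v;
  }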
--- clang/lib/CodeGen/CGCall.cpp | 13 +- clang/lib/CodeGen/CGCleanup.cpp | 49 +-- clang/lib/CodeGen/CGCleanup.h | 57 +-- clang/lib/CodeGen/CGDecl.cpp | 61 +-- clang/lib/CodeGen/CGExpr.cpp | 12 +- clang/lib/CodeGen/CGExprAgg.cpp | 87 ++-- clang/lib/CodeGen/CGExprCXX.cpp | 38 +- clang/lib/CodeGen/CodeGenFunction.cpp | 6 - clang/lib/CodeGen/CodeGenFunction.h | 99 +---- .../CodeGenCXX/control-flow-in-stmt-expr.cpp | 409 ------------------ .../coro-suspend-cleanups.cpp | 93 ---- 11 files changed, 128 insertions(+), 796 deletions(-) delete mode 100644 clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp delete mode 100644 clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 0c860a3ccbd2f0..7a0bc6fa77b889 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -4694,11 +4694,11 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E, AggValueSlot Slot = args.isUsingInAlloca() ? createPlaceholderSlot(*this, type) : CreateAggTemp(type, "agg.tmp"); - bool DestroyedInCallee = true, NeedsCleanup = true; + bool DestroyedInCallee = true, NeedsEHCleanup = true; if (const auto *RD = type->getAsCXXRecordDecl()) DestroyedInCallee = RD->hasNonTrivialDestructor(); else - NeedsCleanup = type.isDestructedType(); + NeedsEHCleanup = needsEHCleanup(type.isDestructedType()); if (DestroyedInCallee) Slot.setExternallyDestructed(); @@ -4707,15 +4707,14 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E, RValue RV = Slot.asRValue(); args.add(RV, type); - if (DestroyedInCallee && NeedsCleanup) { + if (DestroyedInCallee && NeedsEHCleanup) { // Create a no-op GEP between the placeholder and the cleanup so we can // RAUW it successfully. It also serves as a marker of the first // instruction where the cleanup is active. - pushFullExprCleanup(NormalAndEHCleanup, - Slot.getAddress(), type); + pushFullExprCleanup(EHCleanup, Slot.getAddress(), + type); // This unreachable is a temporary marker which will be removed later. - llvm::Instruction *IsActive = - Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy)); + llvm::Instruction *IsActive = Builder.CreateUnreachable(); args.addArgCleanupDeactivation(EHStack.stable_begin(), IsActive); } return; diff --git a/clang/lib/CodeGen/CGCleanup.cpp b/clang/lib/CodeGen/CGCleanup.cpp index 8683f19d9da28e..e6f8e6873004f2 100644 --- a/clang/lib/CodeGen/CGCleanup.cpp +++ b/clang/lib/CodeGen/CGCleanup.cpp @@ -634,19 +634,12 @@ static void destroyOptimisticNormalEntry(CodeGenFunction &CGF, /// Pops a cleanup block. If the block includes a normal cleanup, the /// current insertion point is threaded through the cleanup, as are /// any branch fixups on the cleanup. -void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, - bool ForDeactivation) { +void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { assert(!EHStack.empty() && "cleanup stack is empty!"); assert(isa(*EHStack.begin()) && "top not a cleanup!"); EHCleanupScope &Scope = cast(*EHStack.begin()); assert(Scope.getFixupDepth() <= EHStack.getNumBranchFixups()); - // If we are deactivating a normal cleanup, we need to pretend that the - // fallthrough is unreachable. We restore this IP before returning. - CGBuilderTy::InsertPoint NormalDeactivateOrigIP; - if (ForDeactivation && (Scope.isNormalCleanup() || !getLangOpts().EHAsynch)) { - NormalDeactivateOrigIP = Builder.saveAndClearIP(); - } // Remember activation information. 
bool IsActive = Scope.isActive(); Address NormalActiveFlag = @@ -674,8 +667,7 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, // - whether there's a fallthrough llvm::BasicBlock *FallthroughSource = Builder.GetInsertBlock(); - bool HasFallthrough = - FallthroughSource != nullptr && (IsActive || HasExistingBranches); + bool HasFallthrough = (FallthroughSource != nullptr && IsActive); // Branch-through fall-throughs leave the insertion point set to the // end of the last cleanup, which points to the current scope. The @@ -700,11 +692,7 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, // If we have a prebranched fallthrough into an inactive normal // cleanup, rewrite it so that it leads to the appropriate place. - if (Scope.isNormalCleanup() && HasPrebranchedFallthrough && - !RequiresNormalCleanup) { - // FIXME: Come up with a program which would need forwarding prebranched - // fallthrough and add tests. Otherwise delete this and assert against it. - assert(!IsActive); + if (Scope.isNormalCleanup() && HasPrebranchedFallthrough && !IsActive) { llvm::BasicBlock *prebranchDest; // If the prebranch is semantically branching through the next @@ -736,8 +724,6 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, EHStack.popCleanup(); // safe because there are no fixups assert(EHStack.getNumBranchFixups() == 0 || EHStack.hasNormalCleanups()); - if (NormalDeactivateOrigIP.isSet()) - Builder.restoreIP(NormalDeactivateOrigIP); return; } @@ -774,19 +760,11 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, if (!RequiresNormalCleanup) { // Mark CPP scope end for passed-by-value Arg temp // per Windows ABI which is "normally" Cleanup in callee - if (IsEHa && getInvokeDest()) { - // If we are deactivating a normal cleanup then we don't have a - // fallthrough. Restore original IP to emit CPP scope ends in the correct - // block. - if (NormalDeactivateOrigIP.isSet()) - Builder.restoreIP(NormalDeactivateOrigIP); - if (Personality.isMSVCXXPersonality() && Builder.GetInsertBlock()) + if (IsEHa && getInvokeDest() && Builder.GetInsertBlock()) { + if (Personality.isMSVCXXPersonality()) EmitSehCppScopeEnd(); - if (NormalDeactivateOrigIP.isSet()) - NormalDeactivateOrigIP = Builder.saveAndClearIP(); } destroyOptimisticNormalEntry(*this, Scope); - Scope.MarkEmitted(); EHStack.popCleanup(); } else { // If we have a fallthrough and no other need for the cleanup, @@ -803,7 +781,6 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, } destroyOptimisticNormalEntry(*this, Scope); - Scope.MarkEmitted(); EHStack.popCleanup(); EmitCleanup(*this, Fn, cleanupFlags, NormalActiveFlag); @@ -939,7 +916,6 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, } // IV. Pop the cleanup and emit it. - Scope.MarkEmitted(); EHStack.popCleanup(); assert(EHStack.hasNormalCleanups() == HasEnclosingCleanups); @@ -1008,8 +984,6 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough, } } - if (NormalDeactivateOrigIP.isSet()) - Builder.restoreIP(NormalDeactivateOrigIP); assert(EHStack.hasNormalCleanups() || EHStack.getNumBranchFixups() == 0); // Emit the EH cleanup if required. @@ -1299,8 +1273,17 @@ void CodeGenFunction::DeactivateCleanupBlock(EHScopeStack::stable_iterator C, // to the current RunCleanupsScope. 
if (C == EHStack.stable_begin() && CurrentCleanupScopeDepth.strictlyEncloses(C)) { - PopCleanupBlock(/*FallthroughIsBranchThrough=*/false, - /*ForDeactivation=*/true); + // Per comment below, checking EHAsynch is not really necessary + // it's there to assure zero-impact w/o EHAsynch option + if (!Scope.isNormalCleanup() && getLangOpts().EHAsynch) { + PopCleanupBlock(); + } else { + // If it's a normal cleanup, we need to pretend that the + // fallthrough is unreachable. + CGBuilderTy::InsertPoint SavedIP = Builder.saveAndClearIP(); + PopCleanupBlock(); + Builder.restoreIP(SavedIP); + } return; } diff --git a/clang/lib/CodeGen/CGCleanup.h b/clang/lib/CodeGen/CGCleanup.h index c73c97146abc4d..03e4a29d7b3dbf 100644 --- a/clang/lib/CodeGen/CGCleanup.h +++ b/clang/lib/CodeGen/CGCleanup.h @@ -16,11 +16,8 @@ #include "EHScopeStack.h" #include "Address.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Instruction.h" namespace llvm { class BasicBlock; @@ -269,51 +266,6 @@ class alignas(8) EHCleanupScope : public EHScope { }; mutable struct ExtInfo *ExtInfo; - /// Erases auxillary allocas and their usages for an unused cleanup. - /// Cleanups should mark these allocas as 'used' if the cleanup is - /// emitted, otherwise these instructions would be erased. - struct AuxillaryAllocas { - SmallVector AuxAllocas; - bool used = false; - - // Records a potentially unused instruction to be erased later. - void Add(llvm::AllocaInst *Alloca) { AuxAllocas.push_back(Alloca); } - - // Mark all recorded instructions as used. These will not be erased later. - void MarkUsed() { - used = true; - AuxAllocas.clear(); - } - - ~AuxillaryAllocas() { - if (used) - return; - llvm::SetVector Uses; - for (auto *Inst : llvm::reverse(AuxAllocas)) - CollectUses(Inst, Uses); - // Delete uses in the reverse order of insertion. - for (auto *I : llvm::reverse(Uses)) - I->eraseFromParent(); - } - - private: - void CollectUses(llvm::Instruction *I, - llvm::SetVector &Uses) { - if (!I || !Uses.insert(I)) - return; - for (auto *User : I->users()) - CollectUses(cast(User), Uses); - } - }; - mutable struct AuxillaryAllocas *AuxAllocas; - - AuxillaryAllocas &getAuxillaryAllocas() { - if (!AuxAllocas) { - AuxAllocas = new struct AuxillaryAllocas(); - } - return *AuxAllocas; - } - /// The number of fixups required by enclosing scopes (not including /// this one). If this is the top cleanup scope, all the fixups /// from this index onwards belong to this scope. @@ -346,7 +298,7 @@ class alignas(8) EHCleanupScope : public EHScope { EHScopeStack::stable_iterator enclosingEH) : EHScope(EHScope::Cleanup, enclosingEH), EnclosingNormal(enclosingNormal), NormalBlock(nullptr), - ActiveFlag(Address::invalid()), ExtInfo(nullptr), AuxAllocas(nullptr), + ActiveFlag(Address::invalid()), ExtInfo(nullptr), FixupDepth(fixupDepth) { CleanupBits.IsNormalCleanup = isNormal; CleanupBits.IsEHCleanup = isEH; @@ -360,15 +312,8 @@ class alignas(8) EHCleanupScope : public EHScope { } void Destroy() { - if (AuxAllocas) - delete AuxAllocas; delete ExtInfo; } - void AddAuxAllocas(llvm::SmallVector Allocas) { - for (auto *Alloca : Allocas) - getAuxillaryAllocas().Add(Alloca); - } - void MarkEmitted() { getAuxillaryAllocas().MarkUsed(); } // Objects of EHCleanupScope are not destructed. Use Destroy(). 
~EHCleanupScope() = delete; diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 3f05ebb561da57..ce6d6d8956076e 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -19,7 +19,6 @@ #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "ConstantEmitter.h" -#include "EHScopeStack.h" #include "PatternInit.h" #include "TargetInfo.h" #include "clang/AST/ASTContext.h" @@ -2202,27 +2201,6 @@ void CodeGenFunction::pushDestroy(CleanupKind cleanupKind, Address addr, destroyer, useEHCleanupForArray); } -// Pushes a destroy and defers its deactivation until its -// CleanupDeactivationScope is exited. -void CodeGenFunction::pushDestroyAndDeferDeactivation( - QualType::DestructionKind dtorKind, Address addr, QualType type) { - assert(dtorKind && "cannot push destructor for trivial type"); - - CleanupKind cleanupKind = getCleanupKind(dtorKind); - pushDestroyAndDeferDeactivation( - cleanupKind, addr, type, getDestroyer(dtorKind), cleanupKind & EHCleanup); -} - -void CodeGenFunction::pushDestroyAndDeferDeactivation( - CleanupKind cleanupKind, Address addr, QualType type, Destroyer *destroyer, - bool useEHCleanupForArray) { - llvm::Instruction *DominatingIP = - Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy)); - pushDestroy(cleanupKind, addr, type, destroyer, useEHCleanupForArray); - DeferredDeactivationCleanupStack.push_back( - {EHStack.stable_begin(), DominatingIP}); -} - void CodeGenFunction::pushStackRestore(CleanupKind Kind, Address SPMem) { EHStack.pushCleanup(Kind, SPMem); } @@ -2239,19 +2217,16 @@ void CodeGenFunction::pushLifetimeExtendedDestroy(CleanupKind cleanupKind, // If we're not in a conditional branch, we don't need to bother generating a // conditional cleanup. if (!isInConditionalBranch()) { + // Push an EH-only cleanup for the object now. // FIXME: When popping normal cleanups, we need to keep this EH cleanup // around in case a temporary's destructor throws an exception. + if (cleanupKind & EHCleanup) + EHStack.pushCleanup( + static_cast(cleanupKind & ~NormalCleanup), addr, type, + destroyer, useEHCleanupForArray); - // Add the cleanup to the EHStack. After the full-expr, this would be - // deactivated before being popped from the stack. - pushDestroyAndDeferDeactivation(cleanupKind, addr, type, destroyer, - useEHCleanupForArray); - - // Since this is lifetime-extended, push it once again to the EHStack after - // the full expression. return pushCleanupAfterFullExprWithActiveFlag( - cleanupKind, Address::invalid(), addr, type, destroyer, - useEHCleanupForArray); + cleanupKind, Address::invalid(), addr, type, destroyer, useEHCleanupForArray); } // Otherwise, we should only destroy the object if it's been initialized. @@ -2266,12 +2241,13 @@ void CodeGenFunction::pushLifetimeExtendedDestroy(CleanupKind cleanupKind, Address ActiveFlag = createCleanupActiveFlag(); SavedType SavedAddr = saveValueInCond(addr); - pushCleanupAndDeferDeactivation( - cleanupKind, SavedAddr, type, destroyer, useEHCleanupForArray); - initFullExprCleanupWithFlag(ActiveFlag); + if (cleanupKind & EHCleanup) { + EHStack.pushCleanup( + static_cast(cleanupKind & ~NormalCleanup), SavedAddr, type, + destroyer, useEHCleanupForArray); + initFullExprCleanupWithFlag(ActiveFlag); + } - // Since this is lifetime-extended, push it once again to the EHStack after - // the full expression. 
pushCleanupAfterFullExprWithActiveFlag( cleanupKind, ActiveFlag, SavedAddr, type, destroyer, useEHCleanupForArray); @@ -2466,9 +2442,9 @@ namespace { }; } // end anonymous namespace -/// pushIrregularPartialArrayCleanup - Push a NormalAndEHCleanup to -/// destroy already-constructed elements of the given array. The cleanup may be -/// popped with DeactivateCleanupBlock or PopCleanupBlock. +/// pushIrregularPartialArrayCleanup - Push an EH cleanup to destroy +/// already-constructed elements of the given array. The cleanup +/// may be popped with DeactivateCleanupBlock or PopCleanupBlock. /// /// \param elementType - the immediate element type of the array; /// possibly still an array type @@ -2477,9 +2453,10 @@ void CodeGenFunction::pushIrregularPartialArrayCleanup(llvm::Value *arrayBegin, QualType elementType, CharUnits elementAlign, Destroyer *destroyer) { - pushFullExprCleanup( - NormalAndEHCleanup, arrayBegin, arrayEndPointer, elementType, - elementAlign, destroyer); + pushFullExprCleanup(EHCleanup, + arrayBegin, arrayEndPointer, + elementType, elementAlign, + destroyer); } /// pushRegularPartialArrayCleanup - Push an EH cleanup to destroy diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index c85a339f5e3f88..cf696a1c9f560f 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -115,16 +115,10 @@ RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, CharUnits Align, llvm::AllocaInst *CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, const Twine &Name, llvm::Value *ArraySize) { - llvm::AllocaInst *Alloca; if (ArraySize) - Alloca = Builder.CreateAlloca(Ty, ArraySize, Name); - else - Alloca = new llvm::AllocaInst(Ty, CGM.getDataLayout().getAllocaAddrSpace(), - ArraySize, Name, AllocaInsertPt); - if (Allocas) { - Allocas->Add(Alloca); - } - return Alloca; + return Builder.CreateAlloca(Ty, ArraySize, Name); + return new llvm::AllocaInst(Ty, CGM.getDataLayout().getAllocaAddrSpace(), + ArraySize, Name, AllocaInsertPt); } /// CreateDefaultAlignTempAlloca - This creates an alloca with the diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp index 560a9e2c5ead5c..1b9287ea239347 100644 --- a/clang/lib/CodeGen/CGExprAgg.cpp +++ b/clang/lib/CodeGen/CGExprAgg.cpp @@ -15,7 +15,6 @@ #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "ConstantEmitter.h" -#include "EHScopeStack.h" #include "TargetInfo.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Attr.h" @@ -25,7 +24,6 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" using namespace clang; @@ -560,27 +558,24 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType, // For that, we'll need an EH cleanup. QualType::DestructionKind dtorKind = elementType.isDestructedType(); Address endOfInit = Address::invalid(); - CodeGenFunction::CleanupDeactivationScope deactivation(CGF); - - if (dtorKind) { - CodeGenFunction::AllocaTrackerRAII allocaTracker(CGF); + EHScopeStack::stable_iterator cleanup; + llvm::Instruction *cleanupDominator = nullptr; + if (CGF.needsEHCleanup(dtorKind)) { // In principle we could tell the cleanup where we are more // directly, but the control flow can get so varied here that it // would actually be quite complex. Therefore we go through an // alloca. 
- llvm::Instruction *dominatingIP = - Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(CGF.Int8PtrTy)); endOfInit = CGF.CreateTempAlloca(begin->getType(), CGF.getPointerAlign(), "arrayinit.endOfInit"); - Builder.CreateStore(begin, endOfInit); + cleanupDominator = Builder.CreateStore(begin, endOfInit); CGF.pushIrregularPartialArrayCleanup(begin, endOfInit, elementType, elementAlign, CGF.getDestroyer(dtorKind)); - cast(*CGF.EHStack.find(CGF.EHStack.stable_begin())) - .AddAuxAllocas(allocaTracker.Take()); + cleanup = CGF.EHStack.stable_begin(); - CGF.DeferredDeactivationCleanupStack.push_back( - {CGF.EHStack.stable_begin(), dominatingIP}); + // Otherwise, remember that we didn't need a cleanup. + } else { + dtorKind = QualType::DK_none; } llvm::Value *one = llvm::ConstantInt::get(CGF.SizeTy, 1); @@ -676,6 +671,9 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType, CGF.EmitBlock(endBB); } + + // Leave the partial-array cleanup if we entered one. + if (dtorKind) CGF.DeactivateCleanupBlock(cleanup, cleanupDominator); } //===----------------------------------------------------------------------===// @@ -1376,8 +1374,9 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) { LValue SlotLV = CGF.MakeAddrLValue(Slot.getAddress(), E->getType()); // We'll need to enter cleanup scopes in case any of the element - // initializers throws an exception or contains branch out of the expressions. - CodeGenFunction::CleanupDeactivationScope scope(CGF); + // initializers throws an exception. + SmallVector Cleanups; + llvm::Instruction *CleanupDominator = nullptr; CXXRecordDecl::field_iterator CurField = E->getLambdaClass()->field_begin(); for (LambdaExpr::const_capture_init_iterator i = E->capture_init_begin(), @@ -1396,12 +1395,28 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) { if (QualType::DestructionKind DtorKind = CurField->getType().isDestructedType()) { assert(LV.isSimple()); - if (DtorKind) - CGF.pushDestroyAndDeferDeactivation( - NormalAndEHCleanup, LV.getAddress(CGF), CurField->getType(), - CGF.getDestroyer(DtorKind), false); + if (CGF.needsEHCleanup(DtorKind)) { + if (!CleanupDominator) + CleanupDominator = CGF.Builder.CreateAlignedLoad( + CGF.Int8Ty, + llvm::Constant::getNullValue(CGF.Int8PtrTy), + CharUnits::One()); // placeholder + + CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), CurField->getType(), + CGF.getDestroyer(DtorKind), false); + Cleanups.push_back(CGF.EHStack.stable_begin()); + } } } + + // Deactivate all the partial cleanups in reverse order, which + // generally means popping them. + for (unsigned i = Cleanups.size(); i != 0; --i) + CGF.DeactivateCleanupBlock(Cleanups[i-1], CleanupDominator); + + // Destroy the placeholder if we made one. + if (CleanupDominator) + CleanupDominator->eraseFromParent(); } void AggExprEmitter::VisitExprWithCleanups(ExprWithCleanups *E) { @@ -1690,7 +1705,14 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr( // We'll need to enter cleanup scopes in case any of the element // initializers throws an exception. 
SmallVector cleanups; - CodeGenFunction::CleanupDeactivationScope DeactivateCleanups(CGF); + llvm::Instruction *cleanupDominator = nullptr; + auto addCleanup = [&](const EHScopeStack::stable_iterator &cleanup) { + cleanups.push_back(cleanup); + if (!cleanupDominator) // create placeholder once needed + cleanupDominator = CGF.Builder.CreateAlignedLoad( + CGF.Int8Ty, llvm::Constant::getNullValue(CGF.Int8PtrTy), + CharUnits::One()); + }; unsigned curInitIndex = 0; @@ -1713,8 +1735,10 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr( CGF.EmitAggExpr(InitExprs[curInitIndex++], AggSlot); if (QualType::DestructionKind dtorKind = - Base.getType().isDestructedType()) - CGF.pushDestroyAndDeferDeactivation(dtorKind, V, Base.getType()); + Base.getType().isDestructedType()) { + CGF.pushDestroy(dtorKind, V, Base.getType()); + addCleanup(CGF.EHStack.stable_begin()); + } } } @@ -1789,10 +1813,10 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr( if (QualType::DestructionKind dtorKind = field->getType().isDestructedType()) { assert(LV.isSimple()); - if (dtorKind) { - CGF.pushDestroyAndDeferDeactivation( - NormalAndEHCleanup, LV.getAddress(CGF), field->getType(), - CGF.getDestroyer(dtorKind), false); + if (CGF.needsEHCleanup(dtorKind)) { + CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), field->getType(), + CGF.getDestroyer(dtorKind), false); + addCleanup(CGF.EHStack.stable_begin()); pushedCleanup = true; } } @@ -1805,6 +1829,17 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr( if (GEP->use_empty()) GEP->eraseFromParent(); } + + // Deactivate all the partial cleanups in reverse order, which + // generally means popping them. + assert((cleanupDominator || cleanups.empty()) && + "Missing cleanupDominator before deactivating cleanup blocks"); + for (unsigned i = cleanups.size(); i != 0; --i) + CGF.DeactivateCleanupBlock(cleanups[i-1], cleanupDominator); + + // Destroy the placeholder if we made one. + if (cleanupDominator) + cleanupDominator->eraseFromParent(); } void AggExprEmitter::VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E, diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index a88b29b326bb92..a4fb673284ceca 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1008,8 +1008,8 @@ void CodeGenFunction::EmitNewArrayInitializer( const Expr *Init = E->getInitializer(); Address EndOfInit = Address::invalid(); QualType::DestructionKind DtorKind = ElementType.isDestructedType(); - CleanupDeactivationScope deactivation(*this); - bool pushedCleanup = false; + EHScopeStack::stable_iterator Cleanup; + llvm::Instruction *CleanupDominator = nullptr; CharUnits ElementSize = getContext().getTypeSizeInChars(ElementType); CharUnits ElementAlign = @@ -1105,24 +1105,19 @@ void CodeGenFunction::EmitNewArrayInitializer( } // Enter a partial-destruction Cleanup if necessary. - if (DtorKind) { - AllocaTrackerRAII AllocaTracker(*this); + if (needsEHCleanup(DtorKind)) { // In principle we could tell the Cleanup where we are more // directly, but the control flow can get so varied here that it // would actually be quite complex. Therefore we go through an // alloca. 
- llvm::Instruction *DominatingIP = - Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(Int8PtrTy)); EndOfInit = CreateTempAlloca(BeginPtr.getType(), getPointerAlign(), "array.init.end"); + CleanupDominator = + Builder.CreateStore(BeginPtr.emitRawPointer(*this), EndOfInit); pushIrregularPartialArrayCleanup(BeginPtr.emitRawPointer(*this), EndOfInit, ElementType, ElementAlign, getDestroyer(DtorKind)); - cast(*EHStack.find(EHStack.stable_begin())) - .AddAuxAllocas(AllocaTracker.Take()); - DeferredDeactivationCleanupStack.push_back( - {EHStack.stable_begin(), DominatingIP}); - pushedCleanup = true; + Cleanup = EHStack.stable_begin(); } CharUnits StartAlign = CurPtr.getAlignment(); @@ -1169,6 +1164,9 @@ void CodeGenFunction::EmitNewArrayInitializer( // initialization. llvm::ConstantInt *ConstNum = dyn_cast(NumElements); if (ConstNum && ConstNum->getZExtValue() <= InitListElements) { + // If there was a Cleanup, deactivate it. + if (CleanupDominator) + DeactivateCleanupBlock(Cleanup, CleanupDominator); return; } @@ -1283,14 +1281,13 @@ void CodeGenFunction::EmitNewArrayInitializer( Builder.CreateStore(CurPtr.emitRawPointer(*this), EndOfInit); // Enter a partial-destruction Cleanup if necessary. - if (!pushedCleanup && needsEHCleanup(DtorKind)) { - llvm::Instruction *DominatingIP = - Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(Int8PtrTy)); - pushRegularPartialArrayCleanup(BeginPtr.emitRawPointer(*this), - CurPtr.emitRawPointer(*this), ElementType, + if (!CleanupDominator && needsEHCleanup(DtorKind)) { + llvm::Value *BeginPtrRaw = BeginPtr.emitRawPointer(*this); + llvm::Value *CurPtrRaw = CurPtr.emitRawPointer(*this); + pushRegularPartialArrayCleanup(BeginPtrRaw, CurPtrRaw, ElementType, ElementAlign, getDestroyer(DtorKind)); - DeferredDeactivationCleanupStack.push_back( - {EHStack.stable_begin(), DominatingIP}); + Cleanup = EHStack.stable_begin(); + CleanupDominator = Builder.CreateUnreachable(); } // Emit the initializer into this element. @@ -1298,7 +1295,10 @@ void CodeGenFunction::EmitNewArrayInitializer( AggValueSlot::DoesNotOverlap); // Leave the Cleanup if we entered one. - deactivation.ForceDeactivate(); + if (CleanupDominator) { + DeactivateCleanupBlock(Cleanup, CleanupDominator); + CleanupDominator->eraseFromParent(); + } // Advance to the next element by adjusting the pointer type as necessary. 
llvm::Value *NextPtr = Builder.CreateConstInBoundsGEP1_32( diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 87766a758311d5..86a6ddd80cc114 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -91,8 +91,6 @@ CodeGenFunction::CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext) CodeGenFunction::~CodeGenFunction() { assert(LifetimeExtendedCleanupStack.empty() && "failed to emit a cleanup"); - assert(DeferredDeactivationCleanupStack.empty() && - "missed to deactivate a cleanup"); if (getLangOpts().OpenMP && CurFn) CGM.getOpenMPRuntime().functionFinished(*this); @@ -348,10 +346,6 @@ static void EmitIfUsed(CodeGenFunction &CGF, llvm::BasicBlock *BB) { void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { assert(BreakContinueStack.empty() && "mismatched push/pop in break/continue stack!"); - assert(LifetimeExtendedCleanupStack.empty() && - "mismatched push/pop of cleanups in EHStack!"); - assert(DeferredDeactivationCleanupStack.empty() && - "mismatched activate/deactivate of cleanups!"); bool OnlySimpleReturnStmts = NumSimpleReturnExprs > 0 && NumSimpleReturnExprs == NumReturnExprs diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index d99188671f1f60..ff1873325d409f 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -39,7 +39,6 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/SanitizerStats.h" @@ -671,51 +670,6 @@ class CodeGenFunction : public CodeGenTypeCache { EHScopeStack EHStack; llvm::SmallVector LifetimeExtendedCleanupStack; - - // A stack of cleanups which were added to EHStack but have to be deactivated - // later before being popped or emitted. These are usually deactivated on - // exiting a `CleanupDeactivationScope` scope. For instance, after a - // full-expr. - // - // These are specially useful for correctly emitting cleanups while - // encountering branches out of expression (through stmt-expr or coroutine - // suspensions). - struct DeferredDeactivateCleanup { - EHScopeStack::stable_iterator Cleanup; - llvm::Instruction *DominatingIP; - }; - llvm::SmallVector DeferredDeactivationCleanupStack; - - // Enters a new scope for capturing cleanups which are deferred to be - // deactivated, all of which will be deactivated once the scope is exited. 
- struct CleanupDeactivationScope { - CodeGenFunction &CGF; - size_t OldDeactivateCleanupStackSize; - bool Deactivated; - CleanupDeactivationScope(CodeGenFunction &CGF) - : CGF(CGF), OldDeactivateCleanupStackSize( - CGF.DeferredDeactivationCleanupStack.size()), - Deactivated(false) {} - - void ForceDeactivate() { - assert(!Deactivated && "Deactivating already deactivated scope"); - auto &Stack = CGF.DeferredDeactivationCleanupStack; - for (size_t I = Stack.size(); I > OldDeactivateCleanupStackSize; I--) { - CGF.DeactivateCleanupBlock(Stack[I - 1].Cleanup, - Stack[I - 1].DominatingIP); - Stack[I - 1].DominatingIP->eraseFromParent(); - } - Stack.resize(OldDeactivateCleanupStackSize); - Deactivated = true; - } - - ~CleanupDeactivationScope() { - if (Deactivated) - return; - ForceDeactivate(); - } - }; - llvm::SmallVector SEHTryEpilogueStack; llvm::Instruction *CurrentFuncletPad = nullptr; @@ -921,19 +875,6 @@ class CodeGenFunction : public CodeGenTypeCache { new (Buffer + sizeof(Header) + sizeof(T)) RawAddress(ActiveFlag); } - // Push a cleanup onto EHStack and deactivate it later. It is usually - // deactivated when exiting a `CleanupDeactivationScope` (for example: after a - // full expression). - template - void pushCleanupAndDeferDeactivation(CleanupKind Kind, As... A) { - // Placeholder dominating IP for this cleanup. - llvm::Instruction *DominatingIP = - Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy)); - EHStack.pushCleanup(Kind, A...); - DeferredDeactivationCleanupStack.push_back( - {EHStack.stable_begin(), DominatingIP}); - } - /// Set up the last cleanup that was pushed as a conditional /// full-expression cleanup. void initFullExprCleanup() { @@ -957,8 +898,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// PopCleanupBlock - Will pop the cleanup entry on the stack and /// process all branch fixups. - void PopCleanupBlock(bool FallThroughIsBranchThrough = false, - bool ForDeactivation = false); + void PopCleanupBlock(bool FallThroughIsBranchThrough = false); /// DeactivateCleanupBlock - Deactivates the given cleanup block. /// The block cannot be reactivated. Pops it if it's the top of the @@ -986,7 +926,6 @@ class CodeGenFunction : public CodeGenTypeCache { class RunCleanupsScope { EHScopeStack::stable_iterator CleanupStackDepth, OldCleanupScopeDepth; size_t LifetimeExtendedCleanupStackSize; - CleanupDeactivationScope DeactivateCleanups; bool OldDidCallStackSave; protected: bool PerformCleanup; @@ -1001,7 +940,8 @@ class CodeGenFunction : public CodeGenTypeCache { public: /// Enter a new cleanup scope. 
explicit RunCleanupsScope(CodeGenFunction &CGF) - : DeactivateCleanups(CGF), PerformCleanup(true), CGF(CGF) { + : PerformCleanup(true), CGF(CGF) + { CleanupStackDepth = CGF.EHStack.stable_begin(); LifetimeExtendedCleanupStackSize = CGF.LifetimeExtendedCleanupStack.size(); @@ -1031,7 +971,6 @@ class CodeGenFunction : public CodeGenTypeCache { void ForceCleanup(std::initializer_list ValuesToReload = {}) { assert(PerformCleanup && "Already forced cleanup"); CGF.DidCallStackSave = OldDidCallStackSave; - DeactivateCleanups.ForceDeactivate(); CGF.PopCleanupBlocks(CleanupStackDepth, LifetimeExtendedCleanupStackSize, ValuesToReload); PerformCleanup = false; @@ -2221,11 +2160,6 @@ class CodeGenFunction : public CodeGenTypeCache { Address addr, QualType type); void pushDestroy(CleanupKind kind, Address addr, QualType type, Destroyer *destroyer, bool useEHCleanupForArray); - void pushDestroyAndDeferDeactivation(QualType::DestructionKind dtorKind, - Address addr, QualType type); - void pushDestroyAndDeferDeactivation(CleanupKind cleanupKind, Address addr, - QualType type, Destroyer *destroyer, - bool useEHCleanupForArray); void pushLifetimeExtendedDestroy(CleanupKind kind, Address addr, QualType type, Destroyer *destroyer, bool useEHCleanupForArray); @@ -2764,33 +2698,6 @@ class CodeGenFunction : public CodeGenTypeCache { TBAAAccessInfo *TBAAInfo = nullptr); LValue EmitLoadOfPointerLValue(Address Ptr, const PointerType *PtrTy); -private: - struct AllocaTracker { - void Add(llvm::AllocaInst *I) { Allocas.push_back(I); } - llvm::SmallVector Take() { return std::move(Allocas); } - - private: - llvm::SmallVector Allocas; - }; - AllocaTracker *Allocas = nullptr; - -public: - // Captures all the allocas created during the scope of its RAII object. - struct AllocaTrackerRAII { - AllocaTrackerRAII(CodeGenFunction &CGF) - : CGF(CGF), OldTracker(CGF.Allocas) { - CGF.Allocas = &Tracker; - } - ~AllocaTrackerRAII() { CGF.Allocas = OldTracker; } - - llvm::SmallVector Take() { return Tracker.Take(); } - - private: - CodeGenFunction &CGF; - AllocaTracker *OldTracker; - AllocaTracker Tracker; - }; - /// CreateTempAlloca - This creates an alloca and inserts it into the entry /// block if \p ArraySize is nullptr, otherwise inserts it at the current /// insertion point of the builder. 
The caller is responsible for setting an diff --git a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp deleted file mode 100644 index 0a51b0e4121c33..00000000000000 --- a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp +++ /dev/null @@ -1,409 +0,0 @@ -// RUN: %clang_cc1 --std=c++20 -fexceptions -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefixes=EH %s -// RUN: %clang_cc1 --std=c++20 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefixes=NOEH,CHECK %s - -struct Printy { - Printy(const char *name) : name(name) {} - ~Printy() {} - const char *name; -}; - -int foo() { return 2; } - -struct Printies { - Printy a; - Printy b; - Printy c; -}; - -void ParenInit() { - // CHECK-LABEL: define dso_local void @_Z9ParenInitv() - // CHECK: [[CLEANUP_DEST:%.+]] = alloca i32, align 4 - Printies ps(Printy("a"), - // CHECK: call void @_ZN6PrintyC1EPKc - ({ - if (foo()) return; - // CHECK: if.then: - // CHECK-NEXT: store i32 1, ptr [[CLEANUP_DEST]], align 4 - // CHECK-NEXT: br label %cleanup - Printy("b"); - // CHECK: if.end: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - }), - ({ - if (foo()) return; - // CHECK: if.then{{.*}}: - // CHECK-NEXT: store i32 1, ptr [[CLEANUP_DEST]], align 4 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: br label %cleanup - Printy("c"); - // CHECK: if.end{{.*}}: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZN8PrintiesD1Ev - // CHECK-NEXT: br label %return - })); - // CHECK: cleanup: - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: br label %return -} - -void break_in_stmt_expr() { - // Verify that the "break" in "if.then".calls dtor before jumping to "for.end". - - // CHECK-LABEL: define dso_local void @_Z18break_in_stmt_exprv() - Printies p{Printy("a"), - // CHECK: call void @_ZN6PrintyC1EPKc - ({ - for (;;) { - Printies ps{ - Printy("b"), - // CHECK: for.cond: - // CHECK: call void @_ZN6PrintyC1EPKc - ({ - if (foo()) { - break; - // CHECK: if.then: - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: br label %for.end - } - Printy("c"); - // CHECK: if.end: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - }), - Printy("d")}; - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZN8PrintiesD1Ev - // CHECK-NEXT: br label %for.cond - } - Printy("e"); - // CHECK: for.end: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - }), - Printy("f")}; - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZN8PrintiesD1Ev -} - -void goto_in_stmt_expr() { - // Verify that: - // - correct branch fixups for deactivated normal cleanups are generated correctly. 
- - // CHECK-LABEL: define dso_local void @_Z17goto_in_stmt_exprv() - // CHECK: [[CLEANUP_DEST_SLOT:%cleanup.dest.slot.*]] = alloca i32, align 4 - { - Printies p1{Printy("a"), // CHECK: call void @_ZN6PrintyC1EPKc - ({ - { - Printies p2{Printy("b"), - // CHECK: call void @_ZN6PrintyC1EPKc - ({ - if (foo() == 1) { - goto in; - // CHECK: if.then: - // CHECK-NEXT: store i32 2, ptr [[CLEANUP_DEST_SLOT]], align 4 - // CHECK-NEXT: br label %[[CLEANUP1:.+]] - } - if (foo() == 2) { - goto out; - // CHECK: if.then{{.*}}: - // CHECK-NEXT: store i32 3, ptr [[CLEANUP_DEST_SLOT]], align 4 - // CHECK-NEXT: br label %[[CLEANUP1]] - } - Printy("c"); - // CHECK: if.end{{.*}}: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - }), - Printy("d")}; - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZN8PrintiesD1Ev - // CHECK-NEXT: br label %in - - } - in: - Printy("e"); - // CHECK: in: ; preds = %if.end{{.*}}, %[[CLEANUP1]] - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - }), - Printy("f")}; - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZN8PrintiesD1Ev - // CHECK-NEXT: br label %out - } -out: - return; - // CHECK: out: - // CHECK-NEXT: ret void - - // CHECK: [[CLEANUP1]]: ; preds = %if.then{{.*}}, %if.then - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: %cleanup.dest = load i32, ptr [[CLEANUP_DEST_SLOT]], align 4 - // CHECK-NEXT: switch i32 %cleanup.dest, label %[[CLEANUP2:.+]] [ - // CHECK-NEXT: i32 2, label %in - // CHECK-NEXT: ] - - // CHECK: [[CLEANUP2]]: ; preds = %[[CLEANUP1]] - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: %cleanup.dest{{.*}} = load i32, ptr [[CLEANUP_DEST_SLOT]], align 4 - // CHECK-NEXT: switch i32 %cleanup.dest{{.*}}, label %unreachable [ - // CHECK-NEXT: i32 3, label %out - // CHECK-NEXT: ] -} - -void ArrayInit() { - // Printy arr[4] = {ctorA, ctorB, stmt-exprC, stmt-exprD}; - // Verify that: - // - We do the necessary stores for array cleanups (endOfInit and last constructed element). - // - We update the array init element correctly for ctorA, ctorB and stmt-exprC. - // - stmt-exprC and stmt-exprD share the array body dtor code (see %cleanup). 
- - // CHECK-LABEL: define dso_local void @_Z9ArrayInitv() - // CHECK: %arrayinit.endOfInit = alloca ptr, align 8 - // CHECK: %cleanup.dest.slot = alloca i32, align 4 - // CHECK: %arrayinit.begin = getelementptr inbounds [4 x %struct.Printy], ptr %arr, i64 0, i64 0 - // CHECK: store ptr %arrayinit.begin, ptr %arrayinit.endOfInit, align 8 - Printy arr[4] = { - Printy("a"), - // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str) - // CHECK: [[ARRAYINIT_ELEMENT1:%.+]] = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1 - // CHECK: store ptr [[ARRAYINIT_ELEMENT1]], ptr %arrayinit.endOfInit, align 8 - Printy("b"), - // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT1]], ptr noundef @.str.1) - // CHECK: [[ARRAYINIT_ELEMENT2:%.+]] = getelementptr inbounds %struct.Printy, ptr [[ARRAYINIT_ELEMENT1]], i64 1 - // CHECK: store ptr [[ARRAYINIT_ELEMENT2]], ptr %arrayinit.endOfInit, align 8 - ({ - // CHECK: br i1 {{.*}}, label %if.then, label %if.end - if (foo()) { - return; - // CHECK: if.then: - // CHECK-NEXT: store i32 1, ptr %cleanup.dest.slot, align 4 - // CHECK-NEXT: br label %cleanup - } - // CHECK: if.end: - Printy("c"); - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: %arrayinit.element2 = getelementptr inbounds %struct.Printy, ptr %arrayinit.element1, i64 1 - // CHECK-NEXT: store ptr %arrayinit.element2, ptr %arrayinit.endOfInit, align 8 - }), - ({ - // CHECK: br i1 {{%.+}} label %[[IF_THEN2:.+]], label %[[IF_END2:.+]] - if (foo()) { - return; - // CHECK: [[IF_THEN2]]: - // CHECK-NEXT: store i32 1, ptr %cleanup.dest.slot, align 4 - // CHECK-NEXT: br label %cleanup - } - // CHECK: [[IF_END2]]: - Printy("d"); - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: %array.begin = getelementptr inbounds [4 x %struct.Printy], ptr %arr, i32 0, i32 0 - // CHECK-NEXT: %0 = getelementptr inbounds %struct.Printy, ptr %array.begin, i64 4 - // CHECK-NEXT: br label %[[ARRAY_DESTROY_BODY1:.+]] - }), - }; - - // CHECK: [[ARRAY_DESTROY_BODY1]]: - // CHECK-NEXT: %arraydestroy.elementPast{{.*}} = phi ptr [ %0, %[[IF_END2]] ], [ %arraydestroy.element{{.*}}, %[[ARRAY_DESTROY_BODY1]] ] - // CHECK-NEXT: %arraydestroy.element{{.*}} = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast{{.*}}, i64 -1 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: %arraydestroy.done{{.*}} = icmp eq ptr %arraydestroy.element{{.*}}, %array.begin - // CHECK-NEXT: br i1 %arraydestroy.done{{.*}}, label %[[ARRAY_DESTROY_DONE1:.+]], label %[[ARRAY_DESTROY_BODY1]] - - // CHECK: [[ARRAY_DESTROY_DONE1]]: - // CHECK-NEXT: ret void - - // CHECK: cleanup: - // CHECK-NEXT: %1 = load ptr, ptr %arrayinit.endOfInit, align 8 - // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr %arrayinit.begin, %1 - // CHECK-NEXT: br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2:.+]] - - // CHECK: [[ARRAY_DESTROY_BODY2]]: - // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %1, %cleanup ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY2]] ] - // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element) - // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, %arrayinit.begin - // CHECK-NEXT: br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE2]], label 
%[[ARRAY_DESTROY_BODY2]] - - // CHECK: [[ARRAY_DESTROY_DONE2]]: - // CHECK-NEXT: br label %[[ARRAY_DESTROY_DONE1]] -} - -void ArraySubobjects() { - struct S { - Printy arr1[2]; - Printy arr2[2]; - Printy p; - }; - // CHECK-LABEL: define dso_local void @_Z15ArraySubobjectsv() - // CHECK: %arrayinit.endOfInit = alloca ptr, align 8 - S s{{Printy("a"), Printy("b")}, - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK: call void @_ZN6PrintyC1EPKc - {Printy("a"), - // CHECK: [[ARRAYINIT_BEGIN:%.+]] = getelementptr inbounds [2 x %struct.Printy] - // CHECK: store ptr [[ARRAYINIT_BEGIN]], ptr %arrayinit.endOfInit, align 8 - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK: [[ARRAYINIT_ELEMENT:%.+]] = getelementptr inbounds %struct.Printy - // CHECK: store ptr [[ARRAYINIT_ELEMENT]], ptr %arrayinit.endOfInit, align 8 - ({ - if (foo()) { - return; - // CHECK: if.then: - // CHECK-NEXT: [[V0:%.+]] = load ptr, ptr %arrayinit.endOfInit, align 8 - // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr [[ARRAYINIT_BEGIN]], [[V0]] - // CHECK-NEXT: br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE:.+]], label %[[ARRAY_DESTROY_BODY:.+]] - } - Printy("b"); - }) - }, - Printy("c") - // CHECK: if.end: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZZ15ArraySubobjectsvEN1SD1Ev - // CHECK-NEXT: br label %return - }; - // CHECK: return: - // CHECK-NEXT: ret void - - // CHECK: [[ARRAY_DESTROY_BODY]]: - // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %0, %if.then ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY]] ] - // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element) - // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, [[ARRAYINIT_BEGIN]] - // CHECK-NEXT: br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE]], label %[[ARRAY_DESTROY_BODY]] - - // CHECK: [[ARRAY_DESTROY_DONE]] - // CHECK-NEXT: [[ARRAY_BEGIN:%.+]] = getelementptr inbounds [2 x %struct.Printy], ptr %arr1, i32 0, i32 0 - // CHECK-NEXT: [[V1:%.+]] = getelementptr inbounds %struct.Printy, ptr [[ARRAY_BEGIN]], i64 2 - // CHECK-NEXT: br label %[[ARRAY_DESTROY_BODY2:.+]] - - // CHECK: [[ARRAY_DESTROY_BODY2]]: - // CHECK-NEXT: %arraydestroy.elementPast5 = phi ptr [ %1, %[[ARRAY_DESTROY_DONE]] ], [ %arraydestroy.element6, %[[ARRAY_DESTROY_BODY2]] ] - // CHECK-NEXT: %arraydestroy.element6 = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast5, i64 -1 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element6) - // CHECK-NEXT: %arraydestroy.done7 = icmp eq ptr %arraydestroy.element6, [[ARRAY_BEGIN]] - // CHECK-NEXT: br i1 %arraydestroy.done7, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2]] - - - // CHECK: [[ARRAY_DESTROY_DONE2]]: - // CHECK-NEXT: br label %return -} - -void LambdaInit() { - // CHECK-LABEL: define dso_local void @_Z10LambdaInitv() - auto S = [a = Printy("a"), b = ({ - if (foo()) { - return; - // CHECK: if.then: - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: br label %return - } - Printy("b"); - })]() { return a; }; -} - -void LifetimeExtended() { - // CHECK-LABEL: define dso_local void @_Z16LifetimeExtendedv - struct PrintyRefBind { - const Printy &a; - const Printy &b; - }; - PrintyRefBind ps = {Printy("a"), ({ - if (foo()) { - return; - // CHECK: if.then: - // CHECK-NEXT: call 
void @_ZN6PrintyD1Ev - // CHECK-NEXT: br label %return - } - Printy("b"); - })}; -} - -void NewArrayInit() { - // CHECK-LABEL: define dso_local void @_Z12NewArrayInitv() - // CHECK: %array.init.end = alloca ptr, align 8 - // CHECK: store ptr %0, ptr %array.init.end, align 8 - Printy *array = new Printy[3]{ - "a", - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK: store ptr %array.exp.next, ptr %array.init.end, align 8 - "b", - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK: store ptr %array.exp.next1, ptr %array.init.end, align 8 - ({ - if (foo()) { - return; - // CHECK: if.then: - // CHECK: br i1 %arraydestroy.isempty, label %arraydestroy.done{{.*}}, label %arraydestroy.body - } - "b"; - // CHECK: if.end: - // CHECK: call void @_ZN6PrintyC1EPKc - })}; - // CHECK: arraydestroy.body: - // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %{{.*}}, %if.then ], [ %arraydestroy.element, %arraydestroy.body ] - // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element) - // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, %0 - // CHECK-NEXT: br i1 %arraydestroy.done, label %arraydestroy.done{{.*}}, label %arraydestroy.body - - // CHECK: arraydestroy.done{{.*}}: ; preds = %arraydestroy.body, %if.then - // CHECK-NEXT: br label %return -} - -void DestroyInConditionalCleanup() { - // EH-LABEL: DestroyInConditionalCleanupv() - // NOEH-LABEL: DestroyInConditionalCleanupv() - struct A { - A() {} - ~A() {} - }; - - struct Value { - Value(A) {} - ~Value() {} - }; - - struct V2 { - Value K; - Value V; - }; - // Verify we use conditional cleanups. - (void)(foo() ? V2{A(), A()} : V2{A(), A()}); - // NOEH: cond.true: - // NOEH: call void @_ZZ27DestroyInConditionalCleanupvEN1AC1Ev - // NOEH: store ptr %{{.*}}, ptr %cond-cleanup.save - - // EH: cond.true: - // EH: invoke void @_ZZ27DestroyInConditionalCleanupvEN1AC1Ev - // EH: store ptr %{{.*}}, ptr %cond-cleanup.save -} - -void ArrayInitWithContinue() { - // CHECK-LABEL: @_Z21ArrayInitWithContinuev - // Verify that we start to emit the array destructor. 
- // CHECK: %arrayinit.endOfInit = alloca ptr, align 8 - for (int i = 0; i < 1; ++i) { - Printy arr[2] = {"a", ({ - if (foo()) { - continue; - } - "b"; - })}; - } -} - -struct [[clang::trivial_abi]] HasTrivialABI { - HasTrivialABI(); - ~HasTrivialABI(); -}; -void AcceptTrivialABI(HasTrivialABI, int); -void TrivialABI() { - // CHECK-LABEL: define dso_local void @_Z10TrivialABIv() - AcceptTrivialABI(HasTrivialABI(), ({ - if (foo()) return; - // CHECK: if.then: - // CHECK-NEXT: call void @_ZN13HasTrivialABID1Ev - // CHECK-NEXT: br label %return - 0; - })); -} diff --git a/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp b/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp deleted file mode 100644 index 06cc2069dbe9ae..00000000000000 --- a/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp +++ /dev/null @@ -1,93 +0,0 @@ -// RUN: %clang_cc1 --std=c++20 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s - -#include "Inputs/coroutine.h" - -struct Printy { - Printy(const char *name) : name(name) {} - ~Printy() {} - const char *name; -}; - -struct coroutine { - struct promise_type; - std::coroutine_handle handle; - ~coroutine() { - if (handle) handle.destroy(); - } -}; - -struct coroutine::promise_type { - coroutine get_return_object() { - return {std::coroutine_handle::from_promise(*this)}; - } - std::suspend_never initial_suspend() noexcept { return {}; } - std::suspend_always final_suspend() noexcept { return {}; } - void return_void() {} - void unhandled_exception() {} -}; - -struct Awaiter : std::suspend_always { - Printy await_resume() { return {"awaited"}; } -}; - -int foo() { return 2; } - -coroutine ArrayInitCoro() { - // Verify that: - // - We do the necessary stores for array cleanups. - // - Array cleanups are called by await.cleanup. - // - We activate the cleanup after the first element and deactivate it in await.ready (see cleanup.isactive). 
- - // CHECK-LABEL: define dso_local void @_Z13ArrayInitCorov - // CHECK: %arrayinit.endOfInit = alloca ptr, align 8 - // CHECK: %cleanup.isactive = alloca i1, align 1 - Printy arr[2] = { - Printy("a"), - // CHECK: %arrayinit.begin = getelementptr inbounds [2 x %struct.Printy], ptr %arr.reload.addr, i64 0, i64 0 - // CHECK-NEXT: %arrayinit.begin.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 10 - // CHECK-NEXT: store ptr %arrayinit.begin, ptr %arrayinit.begin.spill.addr, align 8 - // CHECK-NEXT: store i1 true, ptr %cleanup.isactive.reload.addr, align 1 - // CHECK-NEXT: store ptr %arrayinit.begin, ptr %arrayinit.endOfInit.reload.addr, align 8 - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str) - // CHECK-NEXT: %arrayinit.element = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1 - // CHECK-NEXT: %arrayinit.element.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11 - // CHECK-NEXT: store ptr %arrayinit.element, ptr %arrayinit.element.spill.addr, align 8 - // CHECK-NEXT: store ptr %arrayinit.element, ptr %arrayinit.endOfInit.reload.addr, align 8 - co_await Awaiter{} - // CHECK-NEXT: @_ZNSt14suspend_always11await_readyEv - // CHECK-NEXT: br i1 %{{.+}}, label %await.ready, label %CoroSave30 - }; - // CHECK: await.cleanup: ; preds = %AfterCoroSuspend{{.*}} - // CHECK-NEXT: br label %cleanup{{.*}}.from.await.cleanup - - // CHECK: cleanup{{.*}}.from.await.cleanup: ; preds = %await.cleanup - // CHECK: br label %cleanup{{.*}} - - // CHECK: await.ready: - // CHECK-NEXT: %arrayinit.element.reload.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11 - // CHECK-NEXT: %arrayinit.element.reload = load ptr, ptr %arrayinit.element.reload.addr, align 8 - // CHECK-NEXT: call void @_ZN7Awaiter12await_resumeEv - // CHECK-NEXT: store i1 false, ptr %cleanup.isactive.reload.addr, align 1 - // CHECK-NEXT: br label %cleanup{{.*}}.from.await.ready - - // CHECK: cleanup{{.*}}: ; preds = %cleanup{{.*}}.from.await.ready, %cleanup{{.*}}.from.await.cleanup - // CHECK: %cleanup.is_active = load i1, ptr %cleanup.isactive.reload.addr, align 1 - // CHECK-NEXT: br i1 %cleanup.is_active, label %cleanup.action, label %cleanup.done - - // CHECK: cleanup.action: - // CHECK: %arraydestroy.isempty = icmp eq ptr %arrayinit.begin.reload{{.*}}, %{{.*}} - // CHECK-NEXT: br i1 %arraydestroy.isempty, label %arraydestroy.done{{.*}}, label %arraydestroy.body.from.cleanup.action - // Ignore rest of the array cleanup. -} - -coroutine ArrayInitWithCoReturn() { - // CHECK-LABEL: define dso_local void @_Z21ArrayInitWithCoReturnv - // Verify that we start to emit the array destructor. - // CHECK: %arrayinit.endOfInit = alloca ptr, align 8 - Printy arr[2] = {"a", ({ - if (foo()) { - co_return; - } - "b"; - })}; -} From 9ce74d6d470437cde82e91631395322ec1c36eeb Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 16 Apr 2024 09:34:18 -0400 Subject: [PATCH 099/300] [AMDGPU][CodeGen] Improve handling of memcpy for -Os/-Oz compilations (#87632) We had some instances where LLVM would not inline fixed-count memcpy and ended up attempting to lower it as a libcall, which would not work on AMDGPU as the address space doesn't meet the requirement, causing a compiler crash. The patch relaxes the threshold used for -Os/-Oz compilation so we're always allowed to inline memory copy functions.
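For illustration, a minimal reproducer sketch distilled from the new test added below (the kernel name, the 128-byte count, and the exact contents of the #0 attribute group are assumptions for illustration, not taken from the original bug report): under minsize, a fixed-count copy like this exceeded the old MaxStoresPerMemcpyOptSize threshold, so SelectionDAG tried to emit a call to memcpy(), which cannot take a private (addrspace(5)) pointer on AMDGPU.

  ; llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 repro.ll
  define amdgpu_kernel void @repro(ptr addrspace(5) %dst, ptr addrspace(4) %src) #0 {
  entry:
    ; A fixed 128-byte copy: small enough to inline, but over the old
    ; size-optimized store limit, so it used to become a libcall.
    tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %dst, ptr addrspace(4) %src, i64 128, i1 false)
    ret void
  }

  declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5), ptr addrspace(4), i64, i1)

  attributes #0 = { minsize }

With the thresholds relaxed, the same input is expanded inline as loads and stores, as the checks below verify.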
This patch basically does the same thing as https://reviews.llvm.org/D158226 for AMDGPU. Fix #88497. --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 6 + llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 2696 +++++++++++++++++ 2 files changed, 2702 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index f283af6fa07d3e..db69d50799e70b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -59,6 +59,12 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { + // Always lower memset, memcpy, and memmove intrinsics to load/store + // instructions, rather than generating calls to memset, memcpy or memmove. + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U; + // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. setOperationAction(ISD::LOAD, MVT::f32, Promote); diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll new file mode 100644 index 00000000000000..358f42dfe8dd5a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -0,0 +1,2696 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - | FileCheck %s + +%struct.S = type { [32 x i32] } + +@shared = addrspace(3) global %struct.S undef, align 4 + +define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 { +; CHECK-LABEL: memcpy_p0_p0_minsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9 +; CHECK-NEXT:
s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] 
offset:33 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45 +; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 { +; CHECK-LABEL: memcpy_p1_p1_minsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 { +; CHECK-LABEL: memcpy_p1_p4_minsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: 
global_load_dwordx4 v[0:3], v4, s[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 { +; CHECK-LABEL: memcpy_p5_p4_minsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18 +; CHECK-NEXT: s_waitcnt vmcnt(18) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; 
CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22 +; CHECK-NEXT: s_waitcnt vmcnt(22) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(30) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31 +; CHECK-NEXT: s_waitcnt vmcnt(31) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(32) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33 +; CHECK-NEXT: s_waitcnt vmcnt(33) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34 +; CHECK-NEXT: s_waitcnt vmcnt(34) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43 +; CHECK-NEXT: s_waitcnt vmcnt(36) 
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65 +; CHECK-NEXT: 
s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87 
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
+; CHECK-NEXT: s_waitcnt vmcnt(35)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107
+; CHECK-NEXT: s_waitcnt vmcnt(35)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
+; CHECK-NEXT: s_waitcnt vmcnt(35)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90
+; CHECK-NEXT: s_waitcnt vmcnt(34)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91
+; CHECK-NEXT: s_waitcnt vmcnt(33)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92
+; CHECK-NEXT: s_waitcnt vmcnt(32)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT: s_endpgm
+entry:
+  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 {
+; CHECK-LABEL: memcpy_p0_p5_minsize:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8
+; CHECK-NEXT: s_add_u32 s8, s8, s7
+; CHECK-NEXT: s_addc_u32 s9, s9, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: s_waitcnt vmcnt(17)
+; CHECK-NEXT: flat_store_byte v[0:1], v3
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127
+; CHECK-NEXT: s_endpgm
+entry:
+  tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 {
+; CHECK-LABEL: memcpy_p3_p4_minsize:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v24, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1
+; CHECK-NEXT: s_endpgm
+entry:
+  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
+; CHECK-LABEL: memcpy_p0_p3_minsize:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:127
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:126
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:125
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:124
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:122
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:121
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:120
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:118
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:117
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:115
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:114
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:112
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:111
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:110
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:108
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:107
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:104
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:102
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:101
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:100
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:98
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:97
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:95
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:94
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:93
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:92
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:91
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:90
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:89
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:88
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:87
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:85
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:84
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:82
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:81
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:80
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:79
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:78
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:77
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:75
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:74
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:72
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:71
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:70
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:68
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:67
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:64
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:63
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:62
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:61
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:60
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:59
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:58
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:57
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:56
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:55
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:54
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:52
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:51
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:50
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:48
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:47
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:45
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:44
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:42
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:41
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:40
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:39
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:38
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:37
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:36
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:35
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:34
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:32
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:31
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:28
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:27
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:25
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:24
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:23
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:22
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:21
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:20
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:19
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:18
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:8
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:9
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:10
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:11
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:12
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:13
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:14
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:15
+; CHECK-NEXT: ds_read_u8 v13, v2
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:7
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1
+; CHECK-NEXT: flat_store_byte v[0:1], v13
+; CHECK-NEXT: s_endpgm
+entry:
+  tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
+; CHECK-LABEL: memcpy_p0_p0_optsize:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45
+; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46
+; CHECK-NEXT: s_endpgm
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 {
+; CHECK-LABEL: memcpy_p1_p1_optsize:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT: s_endpgm
+entry:
+  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 {
+; CHECK-LABEL: memcpy_p1_p4_optsize:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; CHECK-NEXT: s_endpgm
+entry:
+  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 {
+; CHECK-LABEL: memcpy_p5_p4_optsize:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_add_u32 s8, s8, s7
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1]
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15
+; CHECK-NEXT: s_addc_u32 s9, s9, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19
+; CHECK-NEXT: s_waitcnt vmcnt(19)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21
+; CHECK-NEXT: s_waitcnt vmcnt(21)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(32)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33
+; CHECK-NEXT: s_waitcnt vmcnt(33)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34
+; CHECK-NEXT: s_waitcnt vmcnt(34)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35
+; CHECK-NEXT: s_waitcnt vmcnt(35)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
+; CHECK-NEXT: s_waitcnt vmcnt(35)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95
+; CHECK-NEXT: s_waitcnt vmcnt(36)
+; CHECK-NEXT: buffer_store_byte v3, v1,
s[8:11], 0 offen offset:77 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90 +; CHECK-NEXT: s_waitcnt vmcnt(34) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91 +; CHECK-NEXT: s_waitcnt vmcnt(33) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92 +; CHECK-NEXT: s_waitcnt vmcnt(32) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93 +; CHECK-NEXT: s_waitcnt vmcnt(31) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94 +; CHECK-NEXT: s_waitcnt vmcnt(30) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97 +; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114 +; CHECK-NEXT: 
global_load_ubyte v16, v0, s[0:1] offset:115 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122 +; CHECK-NEXT: s_waitcnt vmcnt(22) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; 
CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 { +; CHECK-LABEL: memcpy_p0_p5_optsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: s_waitcnt vmcnt(17) +; CHECK-NEXT: flat_store_byte v[0:1], v3 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 +; CHECK-NEXT: 
buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52 +; CHECK-NEXT: s_nop 0 
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59 +; CHECK-NEXT: 
buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101 +; CHECK-NEXT: s_nop 0 
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: 
flat_store_byte v[0:1], v4 offset:109 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126 +; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 { +; CHECK-LABEL: memcpy_p3_p4_optsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { +; CHECK-LABEL: memcpy_p0_p3_optsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:127 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:126 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:125 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:124 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126 +; CHECK-NEXT: ds_read_u8 
v3, v2 offset:123 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:122 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:121 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:119 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:118 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:117 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:116 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:115 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:114 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:113 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:112 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:111 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:110 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:109 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:108 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:107 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:104 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:103 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:102 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:101 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:100 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:98 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:97 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:96 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:95 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:94 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:93 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:92 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:91 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93 +; CHECK-NEXT: ds_read_u8 v3, v2 
offset:90 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:89 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:88 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:87 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:86 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:85 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:84 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:83 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:82 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:81 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:79 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:78 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:77 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:76 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:75 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:74 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:73 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:72 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:71 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:70 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:69 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:68 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:67 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:64 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:63 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:62 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:61 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:60 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:58 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 +; CHECK-NEXT: ds_read_u8 v3, v2 
offset:57 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:56 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:55 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:54 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:53 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:52 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:51 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:50 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:49 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:47 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:46 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:45 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:44 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:43 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:42 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:41 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:39 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:38 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:37 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:36 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:35 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:34 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:33 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:32 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:31 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:29 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:27 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:23 +; 
CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:22 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:21 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:20 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:19 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:18 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:16 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:17 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:8 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:9 +; CHECK-NEXT: ds_read_u8 v7, v2 offset:10 +; CHECK-NEXT: ds_read_u8 v8, v2 offset:11 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:12 +; CHECK-NEXT: ds_read_u8 v10, v2 offset:13 +; CHECK-NEXT: ds_read_u8 v11, v2 offset:14 +; CHECK-NEXT: ds_read_u8 v12, v2 offset:15 +; CHECK-NEXT: ds_read_u8 v13, v2 +; CHECK-NEXT: ds_read_u8 v14, v2 offset:1 +; CHECK-NEXT: ds_read_u8 v15, v2 offset:2 +; CHECK-NEXT: ds_read_u8 v16, v2 offset:3 +; CHECK-NEXT: ds_read_u8 v17, v2 offset:4 +; CHECK-NEXT: ds_read_u8 v18, v2 offset:5 +; CHECK-NEXT: ds_read_u8 v19, v2 offset:6 +; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1 +; CHECK-NEXT: flat_store_byte v[0:1], v13 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) + ret void +} + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 
immarg) #2
+
+attributes #0 = { minsize }
+attributes #1 = { optsize }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }

From 6ab5927238395798c2e8b657bb59a0304046099d Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 16 Apr 2024 06:13:03 -0700
Subject: [PATCH 100/300] [SLP][NFC] Add a test showing incorrect vectorization
 of smax with an unsigned argument.

---
 .../RISCV/smax-unsigned-operand.ll            | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/smax-unsigned-operand.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/smax-unsigned-operand.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/smax-unsigned-operand.ll
new file mode 100644
index 00000000000000..577d995f6a7654
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/smax-unsigned-operand.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-gnu"
+
+@e = global [2 x i8] zeroinitializer
+
+define void @main(ptr noalias %p) {
+; CHECK-LABEL: define void @main(
+; CHECK-SAME: ptr noalias [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 1>)
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8>
+; CHECK-NEXT:    store <2 x i8> [[TMP3]], ptr getelementptr inbounds ([11 x i8], ptr @e, i64 0, i64 4), align 1
+; CHECK-NEXT:    ret void
+;
+bb:
+  %conv.4 = zext i32 0 to i64
+  %cond.4 = tail call i64 @llvm.smax.i64(i64 %conv.4, i64 0)
+  %conv5.4 = trunc i64 %cond.4 to i8
+  store i8 %conv5.4, ptr getelementptr inbounds ([11 x i8], ptr @e, i64 0, i64 4), align 1
+  %0 = load i32, ptr %p, align 4
+  %conv.5 = zext i32 %0 to i64
+  %cond.5 = tail call i64 @llvm.smax.i64(i64 %conv.5, i64 1)
+  %conv5.5 = trunc i64 %cond.5 to i8
+  store i8 %conv5.5, ptr getelementptr inbounds ([11 x i8], ptr @e, i64 0, i64 5), align 1
+  ret void
+}
+

From 26ebe16d78b22329d602db0398ce163ad610b0dc Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 16 Apr 2024 06:42:15 -0700
Subject: [PATCH 101/300] [SLP] Fix PR88834: check if an unsigned arg can be
 truncated when used in smax/smin intrinsics.

We need to check that an unsigned argument can be used safely in
smax/smin intrinsics by verifying that at least one sign bit is
cleared; otherwise its value may be treated as negative instead of
positive.
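To see why this matters, here is a minimal standalone C++ sketch of the
miscompile being fixed (illustrative only, not part of the patch; the input
value 200 is an assumed example). smax compares signed values, so an operand
that is non-negative at the original width can become negative after
truncation, flipping the result of the comparison:

    // Sketch: demoting smax from a wide type to i8 without checking the
    // narrow sign bit changes the result.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t x = 200; // non-negative as i64, but bit 7 (the i8 sign bit) is set

      // Original semantics: smax at i64, then truncate the result to i8.
      int64_t wide = std::max<int64_t>(int64_t{x}, int64_t{1}); // 200
      int8_t correct = static_cast<int8_t>(wide);               // -56 after truncation

      // Unsound demotion: truncate the operand first, then smax at i8.
      int8_t narrow = static_cast<int8_t>(x);                   // -56, sign flipped
      int8_t demoted = std::max(narrow, int8_t{1});             // max(-56, 1) = 1, wrong

      std::printf("correct=%d demoted=%d\n", correct, demoted); // correct=-56 demoted=1
      return 0;
    }

This is why the fix below, for an operand that is merely known non-negative,
additionally requires MaskedValueIsZero over the bits from BitWidth - 1
upward: demotion is allowed only when the narrow sign bit can never be set.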
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp  |  8 +++++++-
 .../SLPVectorizer/RISCV/smax-unsigned-operand.ll | 12 ++++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d0bcdceae392bd..0cd3ca32933ca2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -14639,10 +14639,16 @@ bool BoUpSLP::collectValuesToDemote(
         assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
                "Expected min/max intrinsics only.");
         unsigned SignBits = OrigBitWidth - BitWidth;
+        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
         return SignBits <= ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                               nullptr, DT) &&
+               (!isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL)) ||
+                MaskedValueIsZero(I->getOperand(0), Mask,
+                                  SimplifyQuery(*DL))) &&
               SignBits <= ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
-                                              nullptr, DT);
+                                              nullptr, DT) &&
+               (!isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL)) ||
+                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
       });
     };
     if (ID != Intrinsic::abs) {
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/smax-unsigned-operand.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/smax-unsigned-operand.ll
index 577d995f6a7654..5db148ac1b4855 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/smax-unsigned-operand.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/smax-unsigned-operand.ll
@@ -9,11 +9,15 @@ define void @main(ptr noalias %p) {
 ; CHECK-LABEL: define void @main(
 ; CHECK-SAME: ptr noalias [[P:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[CONV_4:%.*]] = zext i32 0 to i64
+; CHECK-NEXT:    [[COND_4:%.*]] = tail call i64 @llvm.smax.i64(i64 [[CONV_4]], i64 0)
+; CHECK-NEXT:    [[CONV5_4:%.*]] = trunc i64 [[COND_4]] to i8
+; CHECK-NEXT:    store i8 [[CONV5_4]], ptr getelementptr inbounds ([11 x i8], ptr @e, i64 0, i64 4), align 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[P]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 1>)
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i32> [[TMP2]] to <2 x i8>
-; CHECK-NEXT:    store <2 x i8> [[TMP3]], ptr getelementptr inbounds ([11 x i8], ptr @e, i64 0, i64 4), align 1
+; CHECK-NEXT:    [[CONV_5:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[COND_5:%.*]] = tail call i64 @llvm.smax.i64(i64 [[CONV_5]], i64 1)
+; CHECK-NEXT:    [[CONV5_5:%.*]] = trunc i64 [[COND_5]] to i8
+; CHECK-NEXT:    store i8 [[CONV5_5]], ptr getelementptr inbounds ([11 x i8], ptr @e, i64 0, i64 5), align 1
 ; CHECK-NEXT:    ret void
 ;
 bb:

From e185978321db4500d72fb1186476ee4104c5928b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 16 Apr 2024 14:23:50 +0100
Subject: [PATCH 102/300] [VectorCombine][X86] Regenerate shuffle.ll +
 shuffle-of-casts.ll

Use v4 of UTC to improve regex matching of argument names and fix a
FileCheck match needed by a future patch.

---
 .../VectorCombine/X86/shuffle-of-casts.ll   | 130 +++++++-----
 .../Transforms/VectorCombine/X86/shuffle.ll | 188 ++++++++++--------
 2 files changed, 185 insertions(+), 133 deletions(-)

diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
index 7d9f7e390b9c04..3a5d2095e2b93b 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
@@ -1,12 +1,13 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX
 
 ; standard vector concatenations
 
 define <16 x i32> @concat_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: @concat_zext_v8i16_v16i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-LABEL: define <16 x i32> @concat_zext_v8i16_v16i32(
+; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[R:%.*]] = zext <16 x i16> [[TMP1]] to <16 x i32>
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
 ;
@@ -17,8 +18,9 @@ define <16 x i32> @concat_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
 }
 
 define <16 x i32> @concat_zext_nneg_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: @concat_zext_nneg_v8i16_v16i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-LABEL: define <16 x i32> @concat_zext_nneg_v8i16_v16i32(
+; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[R:%.*]] = zext nneg <16 x i16> [[TMP1]] to <16 x i32>
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
 ;
@@ -29,14 +31,16 @@ define <16 x i32> @concat_zext_nneg_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
 }
 
 define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: @concat_sext_zext_nneg_v8i16_v8i32(
-; SSE-NEXT:    [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32>
-; SSE-NEXT:    [[X1:%.*]] = zext nneg <8 x i16> [[A1:%.*]] to <8 x i32>
+; SSE-LABEL: define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(
+; SSE-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[X0:%.*]] = sext <8 x i16> [[A0]] to <8 x i32>
+; SSE-NEXT:    [[X1:%.*]] = zext nneg <8 x i16> [[A1]] to <8 x i32>
 ; SSE-NEXT:    [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:    ret <16 x i32> [[R]]
 ;
-; AVX-LABEL: @concat_sext_zext_nneg_v8i16_v8i32(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX-LABEL: define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(
+; AVX-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; AVX-NEXT:    [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
 ; AVX-NEXT:    ret <16 x i32> [[R]]
 ;
@@ -47,8 +51,9 @@ define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(<8 x i16> %a
 }
 
 define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: @concat_sext_v8i16_v16i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-LABEL: define <16 x i32> @concat_sext_v8i16_v16i32(
+; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
 ;
@@ -59,8 +64,9 @@ define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
 }
 
 define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) {
-; CHECK-LABEL: @concat_sext_v4i1_v8i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-LABEL: define <8 x i32> @concat_sext_v4i1_v8i32(
+; CHECK-SAME: <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[A0]], <4 x i1> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[R:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i32>
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
@@ -71,8 +77,9 @@ define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) {
 }
 
 define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: @concat_trunc_v4i32_v8i16(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-LABEL: define <8 x i16> @concat_trunc_v4i32_v8i16(
+; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[R:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
@@ -83,8 +90,9 @@ define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) {
 }
 
 define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: @concat_inttoptr_v4i32_v8iptr(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-LABEL: define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(
+; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[R:%.*]] = inttoptr <8 x i32> [[TMP1]] to <8 x ptr>
 ; CHECK-NEXT:    ret <8 x ptr> [[R]]
 ;
@@ -95,8 +103,9 @@ define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) {
 }
 
 define <16 x i64> @concat_ptrtoint_v8i16_v16i32(<8 x ptr> %a0, <8 x ptr> %a1) {
-; CHECK-LABEL: @concat_ptrtoint_v8i16_v16i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[A0:%.*]], <8 x ptr> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-LABEL: define <16 x i64> @concat_ptrtoint_v8i16_v16i32(
+; CHECK-SAME: <8 x ptr> [[A0:%.*]], <8 x ptr> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[A0]], <8 x ptr> [[A1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[R:%.*]] = ptrtoint <16 x ptr> [[TMP1]] to <16 x i64>
 ; CHECK-NEXT:    ret <16 x i64> [[R]]
 ;
@@ -107,14 +116,16 @@ define <16 x i64> @concat_ptrtoint_v8i16_v16i32(<8 x ptr> %a0, <8 x ptr> %a1) {
 }
 
 define <8 x double> @concat_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: @concat_fpext_v4f32_v8f64(
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-LABEL: define <8 x double> @concat_fpext_v4f32_v8f64(
+; SSE-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double>
 ; SSE-NEXT:    ret <8 x double> [[R]]
 ;
-; AVX-LABEL: @concat_fpext_v4f32_v8f64(
-; AVX-NEXT:    [[X0:%.*]] = fpext <4 x float> [[A0:%.*]] to <4 x double>
-; AVX-NEXT:    [[X1:%.*]] = fpext <4 x float> [[A1:%.*]] to <4 x double>
+; AVX-LABEL: define <8 x double> @concat_fpext_v4f32_v8f64(
+; AVX-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] {
+; AVX-NEXT:    [[X0:%.*]] = fpext <4 x float> [[A0]] to <4 x double>
+; AVX-NEXT:    [[X1:%.*]] = fpext <4 x float> [[A1]] to <4 x double>
 ; AVX-NEXT:    [[R:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; AVX-NEXT:    ret <8 x double> [[R]]
 ;
@@ -125,9 +136,10 @@ define <8 x double> @concat_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1)
 }
 
 define <16 x float> @concat_fptrunc_v8f64_v16f32(<8 x double> %a0, <8 x double> %a1) {
-; CHECK-LABEL: @concat_fptrunc_v8f64_v16f32(
-; CHECK-NEXT:    [[X0:%.*]] = fptrunc <8 x double> [[A0:%.*]] to <8 x float>
-; CHECK-NEXT:    [[X1:%.*]] = fptrunc <8 x double> [[A1:%.*]] to <8 x float>
+; CHECK-LABEL: define <16 x float> @concat_fptrunc_v8f64_v16f32(
+; CHECK-SAME: <8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[X0:%.*]] = fptrunc <8 x double> [[A0]] to <8 x float>
+; CHECK-NEXT:    [[X1:%.*]] = fptrunc <8 x double> [[A1]] to <8 x float>
 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x float> [[X0]], <8 x float> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    ret <16 x float> [[R]]
 ;
@@ -140,8 +152,9 @@ define <16 x float> @concat_fptrunc_v8f64_v16f32(<8 x double> %a0, <8 x double>
 
 ; commuted vector concatenation
 
 define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: @rconcat_sext_v8i16_v16i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-LABEL: define <16 x i32> @rconcat_sext_v8i16_v16i32(
+; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32>
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
 ;
@@ -154,8 +167,9 @@ define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) {
 
 ; interleaved shuffle
 
 define <8 x double> @interleave_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: @interleave_fpext_v4f32_v8f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-LABEL: define <8 x double> @interleave_fpext_v4f32_v8f64(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 ; CHECK-NEXT:    [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double>
 ; CHECK-NEXT:    ret <8 x double> [[R]]
 ;
@@ -168,8 +182,9 @@ define <8 x double> @interleave_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %
 
 ; bitcasts (same element count)
 
 define <8 x float> @concat_bitcast_v4i32_v8f32(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: @concat_bitcast_v4i32_v8f32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-LABEL: define <8 x float> @concat_bitcast_v4i32_v8f32(
+; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[R:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
 ; CHECK-NEXT:    ret <8 x float> [[R]]
 ;
@@ -182,8 +197,9 @@ define <8 x float> @concat_bitcast_v4i32_v8f32(<4 x i32> %a0, <4 x i32> %a1) {
 
 ; bitcasts (lower element count)
 
 define <4 x double> @concat_bitcast_v8i16_v4f64(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: @concat_bitcast_v8i16_v4f64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-LABEL: define <4 x double> @concat_bitcast_v8i16_v4f64(
+; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x
i16> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = bitcast <16 x i16> [[TMP1]] to <4 x double> ; CHECK-NEXT: ret <4 x double> [[R]] ; @@ -196,8 +212,9 @@ define <4 x double> @concat_bitcast_v8i16_v4f64(<8 x i16> %a0, <8 x i16> %a1) { ; bitcasts (higher element count) define <16 x i16> @concat_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: @concat_bitcast_v4i32_v16i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> +; CHECK-LABEL: define <16 x i16> @concat_bitcast_v4i32_v16i16( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> ; CHECK-NEXT: [[R:%.*]] = bitcast <8 x i32> [[TMP1]] to <16 x i16> ; CHECK-NEXT: ret <16 x i16> [[R]] ; @@ -210,11 +227,12 @@ define <16 x i16> @concat_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) { ; negative - multiuse define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(<4 x i32> %a0, <4 x i32> %a1, ptr %a2) { -; CHECK-LABEL: @concat_trunc_v4i32_v8i16_multiuse( -; CHECK-NEXT: [[X0:%.*]] = trunc <4 x i32> [[A0:%.*]] to <4 x i16> -; CHECK-NEXT: [[X1:%.*]] = trunc <4 x i32> [[A1:%.*]] to <4 x i16> +; CHECK-LABEL: define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], ptr [[A2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X0:%.*]] = trunc <4 x i32> [[A0]] to <4 x i16> +; CHECK-NEXT: [[X1:%.*]] = trunc <4 x i32> [[A1]] to <4 x i16> ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[X0]], <4 x i16> [[X1]], <8 x i32> -; CHECK-NEXT: store <4 x i16> [[X0]], ptr [[A2:%.*]], align 8 +; CHECK-NEXT: store <4 x i16> [[X0]], ptr [[A2]], align 8 ; CHECK-NEXT: ret <8 x i16> [[R]] ; %x0 = trunc <4 x i32> %a0 to <4 x i16> @@ -227,9 +245,10 @@ define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(<4 x i32> %a0, <4 x i32> %a1 ; negative - bitcasts (unscalable higher element count) define <16 x i16> @revpair_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: @revpair_bitcast_v4i32_v16i16( -; CHECK-NEXT: [[X0:%.*]] = bitcast <4 x i32> [[A0:%.*]] to <8 x i16> -; CHECK-NEXT: [[X1:%.*]] = bitcast <4 x i32> [[A1:%.*]] to <8 x i16> +; CHECK-LABEL: define <16 x i16> @revpair_bitcast_v4i32_v16i16( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X0:%.*]] = bitcast <4 x i32> [[A0]] to <8 x i16> +; CHECK-NEXT: [[X1:%.*]] = bitcast <4 x i32> [[A1]] to <8 x i16> ; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[X0]], <8 x i16> [[X1]], <16 x i32> ; CHECK-NEXT: ret <16 x i16> [[R]] ; @@ -242,9 +261,10 @@ define <16 x i16> @revpair_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) { ; negative - bitcasts (unscalable element counts) define <4 x i32> @shuffle_bitcast_v32i40_v4i32(<32 x i40> %a0, <32 x i40> %a1) { -; CHECK-LABEL: @shuffle_bitcast_v32i40_v4i32( -; CHECK-NEXT: [[X0:%.*]] = bitcast <32 x i40> [[A0:%.*]] to <40 x i32> -; CHECK-NEXT: [[X1:%.*]] = bitcast <32 x i40> [[A1:%.*]] to <40 x i32> +; CHECK-LABEL: define <4 x i32> @shuffle_bitcast_v32i40_v4i32( +; CHECK-SAME: <32 x i40> [[A0:%.*]], <32 x i40> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X0:%.*]] = bitcast <32 x i40> [[A0]] to <40 x i32> +; CHECK-NEXT: [[X1:%.*]] = bitcast <32 x i40> [[A1]] to <40 x i32> ; CHECK-NEXT: [[R:%.*]] = shufflevector <40 x i32> [[X0]], <40 x i32> [[X1]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; @@ -257,9 +277,10 @@ 
define <4 x i32> @shuffle_bitcast_v32i40_v4i32(<32 x i40> %a0, <32 x i40> %a1) { ; negative - src type mismatch define <8 x i32> @concat_sext_v4i8_v4i16_v8i32(<4 x i8> %a0, <4 x i16> %a1) { -; CHECK-LABEL: @concat_sext_v4i8_v4i16_v8i32( -; CHECK-NEXT: [[X0:%.*]] = sext <4 x i8> [[A0:%.*]] to <4 x i32> -; CHECK-NEXT: [[X1:%.*]] = sext <4 x i16> [[A1:%.*]] to <4 x i32> +; CHECK-LABEL: define <8 x i32> @concat_sext_v4i8_v4i16_v8i32( +; CHECK-SAME: <4 x i8> [[A0:%.*]], <4 x i16> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X0:%.*]] = sext <4 x i8> [[A0]] to <4 x i32> +; CHECK-NEXT: [[X1:%.*]] = sext <4 x i16> [[A1]] to <4 x i32> ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X1]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; @@ -272,9 +293,10 @@ define <8 x i32> @concat_sext_v4i8_v4i16_v8i32(<4 x i8> %a0, <4 x i16> %a1) { ; negative - castop mismatch define <16 x i32> @concat_sext_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { -; CHECK-LABEL: @concat_sext_zext_v8i16_v16i32( -; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32> -; CHECK-NEXT: [[X1:%.*]] = zext <8 x i16> [[A1:%.*]] to <8 x i32> +; CHECK-LABEL: define <16 x i32> @concat_sext_zext_v8i16_v16i32( +; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0]] to <8 x i32> +; CHECK-NEXT: [[X1:%.*]] = zext <8 x i16> [[A1]] to <8 x i32> ; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[R]] ; diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll index 8337bb37bc549d..bb6402347a9b45 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll @@ -1,17 +1,19 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX ; x86 does not have a cheap v16i8 shuffle until SSSE3 (pshufb) define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) { -; SSE-LABEL: @bitcast_shuf_narrow_element( -; SSE-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> +; SSE-LABEL: define <16 x i8> @bitcast_shuf_narrow_element( +; SSE-SAME: <4 x i32> [[V:%.*]]) #[[ATTR0:[0-9]+]] { +; SSE-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V]], <4 x i32> poison, <4 x i32> ; SSE-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> ; SSE-NEXT: ret <16 x i8> [[R]] ; -; AVX-LABEL: @bitcast_shuf_narrow_element( -; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +; AVX-LABEL: define <16 x i8> @bitcast_shuf_narrow_element( +; AVX-SAME: <4 x i32> [[V:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V]] to <16 x i8> ; AVX-NEXT: [[R:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> ; AVX-NEXT: ret <16 x i8> [[R]] ; @@ -23,8 +25,9 @@ define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) { ; v4f32 is the same cost as v4i32, so this always works define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) { -; CHECK-LABEL: @bitcast_shuf_same_size( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <4 x float> +; CHECK-LABEL: define <4 x float> 
@bitcast_shuf_same_size( +; CHECK-SAME: <4 x i32> [[V:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V]] to <4 x float> ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; @@ -36,13 +39,15 @@ define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) { ; Length-changing shuffles define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) { -; SSE-LABEL: @bitcast_shuf_narrow_element_subvector( -; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> +; SSE-LABEL: define <16 x i8> @bitcast_shuf_narrow_element_subvector( +; SSE-SAME: <2 x i32> [[V:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V]], <2 x i32> poison, <4 x i32> ; SSE-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> ; SSE-NEXT: ret <16 x i8> [[R]] ; -; AVX-LABEL: @bitcast_shuf_narrow_element_subvector( -; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +; AVX-LABEL: define <16 x i8> @bitcast_shuf_narrow_element_subvector( +; AVX-SAME: <2 x i32> [[V:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V]] to <8 x i8> ; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> ; AVX-NEXT: ret <16 x i8> [[R]] ; @@ -52,13 +57,15 @@ define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) { } define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) { -; SSE-LABEL: @bitcast_shuf_narrow_element_concat_subvectors( -; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> +; SSE-LABEL: define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors( +; SSE-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V]], <2 x i64> poison, <4 x i32> ; SSE-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16> ; SSE-NEXT: ret <16 x i16> [[R]] ; -; AVX-LABEL: @bitcast_shuf_narrow_element_concat_subvectors( -; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <8 x i16> +; AVX-LABEL: define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors( +; AVX-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V]] to <8 x i16> ; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> ; AVX-NEXT: ret <16 x i16> [[R]] ; @@ -68,8 +75,9 @@ define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) { } define <16 x i8> @bitcast_shuf_extract_subvector(<8 x i32> %v) { -; CHECK-LABEL: @bitcast_shuf_extract_subvector( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[V:%.*]] to <32 x i8> +; CHECK-LABEL: define <16 x i8> @bitcast_shuf_extract_subvector( +; CHECK-SAME: <8 x i32> [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[V]] to <32 x i8> ; CHECK-NEXT: [[R:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[R]] ; @@ -81,8 +89,9 @@ define <16 x i8> @bitcast_shuf_extract_subvector(<8 x i32> %v) { ; Negative test - must cast to vector type define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) { -; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_type( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> +; CHECK-LABEL: define i128 @bitcast_shuf_narrow_element_wrong_type( +; CHECK-SAME: <4 x i32> [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V]], <4 x i32> poison, <4 x 
i32> ; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to i128 ; CHECK-NEXT: ret i128 [[R]] ; @@ -94,8 +103,9 @@ define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) { ; Widen shuffle elements define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) { -; CHECK-LABEL: @bitcast_shuf_wide_element( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32> +; CHECK-LABEL: define <4 x i32> @bitcast_shuf_wide_element( +; CHECK-SAME: <8 x i16> [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V]] to <4 x i32> ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; @@ -109,8 +119,9 @@ declare void @use(<4 x i32>) ; Negative test - don't create an extra shuffle define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) { -; CHECK-LABEL: @bitcast_shuf_uses( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> +; CHECK-LABEL: define <16 x i8> @bitcast_shuf_uses( +; CHECK-SAME: <4 x i32> [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @use(<4 x i32> [[SHUF]]) ; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> ; CHECK-NEXT: ret <16 x i8> [[R]] @@ -125,8 +136,9 @@ define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) { ; TODO - can we remove the empty bitcast(bitcast()) ? define <4 x i64> @bitcast_shuf_remove_bitcasts(<2 x i64> %a0, <2 x i64> %a1) { -; CHECK-LABEL: @bitcast_shuf_remove_bitcasts( -; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i32> +; CHECK-LABEL: define <4 x i64> @bitcast_shuf_remove_bitcasts( +; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[A0]], <2 x i64> [[A1]], <4 x i32> ; CHECK-NEXT: [[SHUF:%.*]] = bitcast <4 x i64> [[R]] to <8 x i32> ; CHECK-NEXT: [[R1:%.*]] = bitcast <8 x i32> [[SHUF]] to <4 x i64> ; CHECK-NEXT: ret <4 x i64> [[R1]] @@ -141,9 +153,10 @@ define <4 x i64> @bitcast_shuf_remove_bitcasts(<2 x i64> %a0, <2 x i64> %a1) { ; shuffle of 2 operands must reduce bitcasts define <8 x i32> @bitcast_shuf_one_bitcast(<4 x i32> %a0, <2 x i64> %a1) { -; CHECK-LABEL: @bitcast_shuf_one_bitcast( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[A1:%.*]] to <4 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[TMP1]], <8 x i32> +; CHECK-LABEL: define <8 x i32> @bitcast_shuf_one_bitcast( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[A1]] to <4 x i32> +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[TMP1]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; %bc0 = bitcast <4 x i32> %a0 to <2 x i64> @@ -155,8 +168,9 @@ define <8 x i32> @bitcast_shuf_one_bitcast(<4 x i32> %a0, <2 x i64> %a1) { ; Negative test - shuffle of 2 operands must not increase bitcasts define <8 x i32> @bitcast_shuf_too_many_bitcasts(<2 x i64> %a0, <2 x i64> %a1) { -; CHECK-LABEL: @bitcast_shuf_too_many_bitcasts( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i32> +; CHECK-LABEL: define <8 x i32> @bitcast_shuf_too_many_bitcasts( +; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[A0]], <2 x i64> [[A1]], <4 x i32> ; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] 
; @@ -166,8 +180,9 @@ define <8 x i32> @bitcast_shuf_too_many_bitcasts(<2 x i64> %a0, <2 x i64> %a1) { } define <2 x i64> @PR35454_1(<2 x i64> %v) { -; SSE-LABEL: @PR35454_1( -; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; SSE-LABEL: define <2 x i64> @PR35454_1( +; SSE-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V]] to <4 x i32> ; SSE-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> poison, <4 x i32> ; SSE-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8> ; SSE-NEXT: [[ADD:%.*]] = shl <16 x i8> [[BC1]], @@ -176,8 +191,9 @@ define <2 x i64> @PR35454_1(<2 x i64> %v) { ; SSE-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> ; SSE-NEXT: ret <2 x i64> [[BC3]] ; -; AVX-LABEL: @PR35454_1( -; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <16 x i8> +; AVX-LABEL: define <2 x i64> @PR35454_1( +; AVX-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V]] to <16 x i8> ; AVX-NEXT: [[BC1:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> ; AVX-NEXT: [[ADD:%.*]] = shl <16 x i8> [[BC1]], ; AVX-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32> @@ -196,8 +212,9 @@ define <2 x i64> @PR35454_1(<2 x i64> %v) { } define <2 x i64> @PR35454_2(<2 x i64> %v) { -; SSE-LABEL: @PR35454_2( -; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; SSE-LABEL: define <2 x i64> @PR35454_2( +; SSE-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V]] to <4 x i32> ; SSE-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> poison, <4 x i32> ; SSE-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16> ; SSE-NEXT: [[ADD:%.*]] = shl <8 x i16> [[BC1]], @@ -206,8 +223,9 @@ define <2 x i64> @PR35454_2(<2 x i64> %v) { ; SSE-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> ; SSE-NEXT: ret <2 x i64> [[BC3]] ; -; AVX-LABEL: @PR35454_2( -; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <8 x i16> +; AVX-LABEL: define <2 x i64> @PR35454_2( +; AVX-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V]] to <8 x i16> ; AVX-NEXT: [[BC1:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> ; AVX-NEXT: [[ADD:%.*]] = shl <8 x i16> [[BC1]], ; AVX-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32> @@ -228,10 +246,11 @@ define <2 x i64> @PR35454_2(<2 x i64> %v) { ; Shuffle is much cheaper than fdiv. FMF are intersected. define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x float> %z) { -; CHECK-LABEL: @shuf_fdiv_v4f32_yy( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> [[Z:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = fdiv arcp <4 x float> [[TMP1]], [[TMP2]] +; CHECK-LABEL: define <4 x float> @shuf_fdiv_v4f32_yy( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Z]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[R:%.*]] = fdiv arcp <4 x float> [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret <4 x float> [[R]] ; %b0 = fdiv fast <4 x float> %x, %y @@ -243,11 +262,12 @@ define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x floa ; Common operand is op0 of the binops. 
define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: @shuf_add_v4i32_xx( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> -; CHECK-NEXT: [[R:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i32> [[R]] +; CHECK-LABEL: define <4 x i32> @shuf_add_v4i32_xx( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> +; CHECK-NEXT: [[R2:%.*]] = add <4 x i32> [[TMP1]], [[R1]] +; CHECK-NEXT: ret <4 x i32> [[R2]] ; %b0 = add <4 x i32> %x, %y %b1 = add <4 x i32> %x, %z @@ -258,9 +278,10 @@ define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; For commutative instructions, common operand may be swapped. define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) { -; CHECK-LABEL: @shuf_fmul_v4f32_xx_swap( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> +; CHECK-LABEL: define <4 x float> @shuf_fmul_v4f32_xx_swap( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <4 x float> [[R]] ; @@ -273,9 +294,10 @@ define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x ; For commutative instructions, common operand may be swapped. 
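; (Editorial sketch, not part of the patch: in the test below the common
; operand %y is op1 of %b0 but op0 of %b1; because 'and' is commutative the
; fold still applies, pre-shuffling %y against poison and %x with %z, exactly
; as the CHECK lines show.)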
define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { -; CHECK-LABEL: @shuf_and_v2i64_yy_swap( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[Y:%.*]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X:%.*]], <2 x i64> [[Z:%.*]], <2 x i32> +; CHECK-LABEL: define <2 x i64> @shuf_and_v2i64_yy_swap( +; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> ; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <2 x i64> [[R]] ; @@ -288,9 +310,10 @@ define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; non-commutative binop, but common op0 define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: @shuf_shl_v4i32_xx( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> +; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> ; CHECK-NEXT: [[R:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <4 x i32> [[R]] ; @@ -303,11 +326,12 @@ define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { ; negative test - common operand, but not commutable define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: @shuf_shl_v4i32_xx_swap( -; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[Z:%.*]], [[X]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R]] +; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[Z]], [[X]] +; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[R1]] ; %b0 = shl <4 x i32> %x, %y %b1 = shl <4 x i32> %z, %x @@ -318,9 +342,10 @@ define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; negative test - mismatched opcodes define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { -; CHECK-LABEL: @shuf_sub_add_v2i64_yy( -; CHECK-NEXT: [[B0:%.*]] = sub <2 x i64> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = add <2 x i64> [[Z:%.*]], [[Y]] +; CHECK-LABEL: define <2 x i64> @shuf_sub_add_v2i64_yy( +; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = sub <2 x i64> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = add <2 x i64> [[Z]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[B0]], <2 x i64> [[B1]], <2 x i32> ; CHECK-NEXT: ret <2 x i64> [[R]] ; @@ -333,9 +358,10 @@ define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z ; negative test - type change via shuffle define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x 
float> %z) { -; CHECK-LABEL: @shuf_fmul_v4f32_xx_type( -; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z:%.*]], [[X]] +; CHECK-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]] ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <8 x i32> ; CHECK-NEXT: ret <8 x float> [[R]] ; @@ -348,10 +374,11 @@ define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x ; negative test - uses define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: @shuf_lshr_v4i32_yy_use1( -; CHECK-NEXT: [[B0:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] +; CHECK-LABEL: define <4 x i32> @shuf_lshr_v4i32_yy_use1( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = lshr <4 x i32> [[X]], [[Y]] ; CHECK-NEXT: call void @use(<4 x i32> [[B0]]) -; CHECK-NEXT: [[B1:%.*]] = lshr <4 x i32> [[Z:%.*]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = lshr <4 x i32> [[Z]], [[Y]] ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; @@ -365,9 +392,10 @@ define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32> ; negative test - uses define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: @shuf_mul_v4i32_yy_use2( -; CHECK-NEXT: [[B0:%.*]] = mul <4 x i32> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = mul <4 x i32> [[Z:%.*]], [[Y]] +; CHECK-LABEL: define <4 x i32> @shuf_mul_v4i32_yy_use2( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = mul <4 x i32> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = mul <4 x i32> [[Z]], [[Y]] ; CHECK-NEXT: call void @use(<4 x i32> [[B1]]) ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] @@ -382,9 +410,10 @@ define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; negative test - must have matching operand define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) { -; CHECK-LABEL: @shuf_fadd_v4f32_no_common_op( -; CHECK-NEXT: [[B0:%.*]] = fadd <4 x float> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = fadd <4 x float> [[Z:%.*]], [[W:%.*]] +; CHECK-LABEL: define <4 x float> @shuf_fadd_v4f32_no_common_op( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = fadd <4 x float> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = fadd <4 x float> [[Z]], [[W]] ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; @@ -397,9 +426,10 @@ define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, ; negative test - binops may be relatively cheap define <16 x i16> @shuf_and_v16i16_yy_expensive_shuf(<16 x i16> %x, <16 x i16> %y, <16 x i16> %z) { -; CHECK-LABEL: @shuf_and_v16i16_yy_expensive_shuf( -; CHECK-NEXT: [[B0:%.*]] = and <16 x i16> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = and <16 x i16> [[Y]], [[Z:%.*]] +; CHECK-LABEL: define <16 x i16> 
@shuf_and_v16i16_yy_expensive_shuf( +; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]], <16 x i16> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = and <16 x i16> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = and <16 x i16> [[Y]], [[Z]] ; CHECK-NEXT: [[R:%.*]] = shufflevector <16 x i16> [[B0]], <16 x i16> [[B1]], <16 x i32> ; CHECK-NEXT: ret <16 x i16> [[R]] ; From e84b2fb48df882ee1f49bbbca29d44fba22192a8 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 16 Apr 2024 09:48:13 -0400 Subject: [PATCH 103/300] [LV][NFCI] Use integer for cost/trip count calculations instead of double, fix possible UB. Using an fp type in the compiler is not the best idea; here it is used in a comparison for equality with 0, which may cause undefined behavior in some cases. Reviewers: fhahn Reviewed By: fhahn Pull Request: https://github.com/llvm/llvm-project/pull/87241 --- .../Transforms/Vectorize/LoopVectorize.cpp | 14 ++--- .../vectorize-force-tail-with-evl.ll | 53 +++++++++++++++++++ 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5535cc55e93216..2057cab46135ff 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9722,7 +9722,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, } // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. - double ScalarC = *VF.ScalarCost.getValue(); + uint64_t ScalarC = *VF.ScalarCost.getValue(); if (ScalarC == 0) return true; @@ -9749,7 +9749,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC // // Now we can compute the minimum required trip count TC as - // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC + // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC // // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that // the computations are performed on doubles, not integers and the result @@ -9761,9 +9761,9 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, AssumedMinimumVscale = *VScale; IntVF *= AssumedMinimumVscale; } - double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; - double RtC = *CheckCost.getValue(); - double MinTC1 = RtC / (ScalarC - VecCOverVF); + uint64_t RtC = *CheckCost.getValue(); + uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue(); + uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); // Second, compute a minimum iteration count so that the cost of the // runtime checks is only a fraction of the total scalar loop cost. This @@ -9772,12 +9772,12 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // * TC. To bound the runtime check to be a fraction 1/X of the scalar // cost, compute // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC - double MinTC2 = RtC * 10 / ScalarC; + uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC); // Now pick the larger minimum. If it is not a multiple of VF and a scalar // epilogue is allowed, choose the next closest multiple of VF. This should // partly compensate for ignoring the epilogue cost.
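// (Editorial illustration, not part of the patch: a worked example of the
// integer rewrite above, with costs chosen purely for illustration. Assume
// ScalarC = 4, IntVF = 4, VF.Cost = 6 and RtC = 20. The old floating-point
// form computed MinTC1 = 20 / (4 - 6/4.0) = 8.0, while the integer form
// computes Div = 4 * 4 - 6 = 10 and MinTC1 = divideCeil(20 * 4, 10) = 8;
// the Div == 0 guard also removes the old hazard when ScalarC * IntVF
// equals VF.Cost.)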
- uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); + uint64_t MinTC = std::max(MinTC1, MinTC2); if (SEL == CM_ScalarEpilogueAllowed) MinTC = alignTo(MinTC, IntVF); VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll index a90b38c6a96056..fe98ca167a089e 100644 --- a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll @@ -9,6 +9,11 @@ ; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-width=4 \ ; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=NO-VP %s +; RUN: opt -passes=loop-vectorize \ +; RUN: -force-tail-folding-style=none \ +; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ +; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=NO-VP-DEF %s + ; The target does not support predicated vectorization. define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-LABEL: @foo( @@ -80,6 +85,54 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP: for.cond.cleanup: ; NO-VP-NEXT: ret void ; +; NO-VP-DEF-LABEL: @foo( +; NO-VP-DEF-NEXT: entry: +; NO-VP-DEF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-DEF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP0]] +; NO-VP-DEF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP-DEF: vector.ph: +; NO-VP-DEF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-DEF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP1]] +; NO-VP-DEF-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-VP-DEF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-DEF-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP-DEF: vector.body: +; NO-VP-DEF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-DEF-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-DEF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]] +; NO-VP-DEF-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; NO-VP-DEF-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 +; NO-VP-DEF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]] +; NO-VP-DEF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; NO-VP-DEF-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP7]], align 4 +; NO-VP-DEF-NEXT: [[TMP8:%.*]] = add nsw [[WIDE_LOAD1]], [[WIDE_LOAD]] +; NO-VP-DEF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] +; NO-VP-DEF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 +; NO-VP-DEF-NEXT: store [[TMP8]], ptr [[TMP10]], align 4 +; NO-VP-DEF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; NO-VP-DEF-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-DEF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP-DEF: middle.block: +; NO-VP-DEF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-VP-DEF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; NO-VP-DEF: scalar.ph: +; NO-VP-DEF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; NO-VP-DEF-NEXT: br label 
[[FOR_BODY:%.*]] +; NO-VP-DEF: for.body: +; NO-VP-DEF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-DEF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; NO-VP-DEF-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-DEF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; NO-VP-DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; NO-VP-DEF-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]] +; NO-VP-DEF-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; NO-VP-DEF-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; NO-VP-DEF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; NO-VP-DEF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; NO-VP-DEF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-VP-DEF: for.cond.cleanup: +; NO-VP-DEF-NEXT: ret void +; entry: br label %for.body From 254df2e35c14414564d233902b25305ed17d251c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 16 Apr 2024 14:52:16 +0100 Subject: [PATCH 104/300] [VectorCombine][X86] shuffle-of-binops.ll - split off foldShuffleOfBinops tests from shuffle.ll --- .../VectorCombine/X86/shuffle-of-binops.ll | 204 ++++++++++++++++++ .../Transforms/VectorCombine/X86/shuffle.ll | 200 +---------------- 2 files changed, 206 insertions(+), 198 deletions(-) create mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll new file mode 100644 index 00000000000000..e2ff343944cf2a --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll @@ -0,0 +1,204 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX + +declare void @use(<4 x i32>) + +; Shuffle is much cheaper than fdiv. FMF are intersected. + +define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x float> %z) { +; CHECK-LABEL: define <4 x float> @shuf_fdiv_v4f32_yy( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Z]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[R:%.*]] = fdiv arcp <4 x float> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x float> [[R]] +; + %b0 = fdiv fast <4 x float> %x, %y + %b1 = fdiv arcp <4 x float> %z, %y + %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> + ret <4 x float> %r +} + +; Common operand is op0 of the binops. 
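+; (Editorial sketch, not part of the patch: the fold exercised below rewrites
+; a shuffle of two binops that share op0, e.g.
+;   %b0 = add <4 x i32> %x, %y
+;   %b1 = add <4 x i32> %x, %z
+;   %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> <mask>
+; into shuffles of the operands feeding a single binop:
+;   %t0 = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <mask>
+;   %t1 = shufflevector <4 x i32> %y, <4 x i32> %z, <4 x i32> <mask>
+;   %r = add <4 x i32> %t0, %t1
+; which is the shape the CHECK lines below expect.)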
+ +define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: define <4 x i32> @shuf_add_v4i32_xx( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> +; CHECK-NEXT: [[R2:%.*]] = add <4 x i32> [[TMP1]], [[R1]] +; CHECK-NEXT: ret <4 x i32> [[R2]] +; + %b0 = add <4 x i32> %x, %y + %b1 = add <4 x i32> %x, %z + %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> + ret <4 x i32> %r +} + +; For commutative instructions, common operand may be swapped. + +define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) { +; CHECK-LABEL: define <4 x float> @shuf_fmul_v4f32_xx_swap( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x float> [[R]] +; + %b0 = fmul <4 x float> %x, %y + %b1 = fmul <4 x float> %z, %x + %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> + ret <4 x float> %r +} + +; For commutative instructions, common operand may be swapped. + +define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { +; CHECK-LABEL: define <2 x i64> @shuf_and_v2i64_yy_swap( +; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> +; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <2 x i64> [[R]] +; + %b0 = and <2 x i64> %x, %y + %b1 = and <2 x i64> %y, %z + %r = shufflevector <2 x i64> %b0, <2 x i64> %b1, <2 x i32> + ret <2 x i64> %r +} + +; non-commutative binop, but common op0 + +define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> +; CHECK-NEXT: [[R:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x i32> [[R]] +; + %b0 = shl <4 x i32> %x, %y + %b1 = shl <4 x i32> %x, %z + %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> + ret <4 x i32> %r +} + +; negative test - common operand, but not commutable + +define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[Z]], [[X]] +; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[R1]] +; + %b0 = shl <4 x i32> %x, %y + %b1 = shl <4 x i32> %z, %x + %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> + ret <4 x i32> %r +} + +; negative test - mismatched opcodes + +define <2 x i64> 
@shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { +; CHECK-LABEL: define <2 x i64> @shuf_sub_add_v2i64_yy( +; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = sub <2 x i64> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = add <2 x i64> [[Z]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[B0]], <2 x i64> [[B1]], <2 x i32> +; CHECK-NEXT: ret <2 x i64> [[R]] +; + %b0 = sub <2 x i64> %x, %y + %b1 = add <2 x i64> %z, %y + %r = shufflevector <2 x i64> %b0, <2 x i64> %b1, <2 x i32> + ret <2 x i64> %r +} + +; negative test - type change via shuffle + +define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) { +; CHECK-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]] +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[R]] +; + %b0 = fmul <4 x float> %x, %y + %b1 = fmul <4 x float> %z, %x + %r = shufflevector <4 x float> %b0, <4 x float> %b1, <8 x i32> + ret <8 x float> %r +} + +; negative test - uses + +define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: define <4 x i32> @shuf_lshr_v4i32_yy_use1( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = lshr <4 x i32> [[X]], [[Y]] +; CHECK-NEXT: call void @use(<4 x i32> [[B0]]) +; CHECK-NEXT: [[B1:%.*]] = lshr <4 x i32> [[Z]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[R]] +; + %b0 = lshr <4 x i32> %x, %y + call void @use(<4 x i32> %b0) + %b1 = lshr <4 x i32> %z, %y + %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> + ret <4 x i32> %r +} + +; negative test - uses + +define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: define <4 x i32> @shuf_mul_v4i32_yy_use2( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = mul <4 x i32> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = mul <4 x i32> [[Z]], [[Y]] +; CHECK-NEXT: call void @use(<4 x i32> [[B1]]) +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[R]] +; + %b0 = mul <4 x i32> %x, %y + %b1 = mul <4 x i32> %z, %y + call void @use(<4 x i32> %b1) + %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> + ret <4 x i32> %r +} + +; negative test - must have matching operand + +define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) { +; CHECK-LABEL: define <4 x float> @shuf_fadd_v4f32_no_common_op( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = fadd <4 x float> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = fadd <4 x float> [[Z]], [[W]] +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[R]] +; + %b0 = fadd <4 x float> %x, %y + %b1 = fadd <4 x float> %z, %w + %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> + ret <4 x float> %r +} + +; negative test - binops may be relatively cheap + +define <16 x 
i16> @shuf_and_v16i16_yy_expensive_shuf(<16 x i16> %x, <16 x i16> %y, <16 x i16> %z) { +; CHECK-LABEL: define <16 x i16> @shuf_and_v16i16_yy_expensive_shuf( +; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]], <16 x i16> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = and <16 x i16> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = and <16 x i16> [[Y]], [[Z]] +; CHECK-NEXT: [[R:%.*]] = shufflevector <16 x i16> [[B0]], <16 x i16> [[B1]], <16 x i32> +; CHECK-NEXT: ret <16 x i16> [[R]] +; + %b0 = and <16 x i16> %x, %y + %b1 = and <16 x i16> %y, %z + %r = shufflevector <16 x i16> %b0, <16 x i16> %b1, <16 x i32> + ret <16 x i16> %r +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} +; SSE: {{.*}} diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll index bb6402347a9b45..c8c9aa161ae289 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll @@ -2,6 +2,8 @@ ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX +declare void @use(<4 x i32>) + ; x86 does not have a cheap v16i8 shuffle until SSSE3 (pshufb) define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) { @@ -114,8 +116,6 @@ define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) { ret <4 x i32> %r } -declare void @use(<4 x i32>) - ; Negative test - don't create an extra shuffle define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) { @@ -242,199 +242,3 @@ define <2 x i64> @PR35454_2(<2 x i64> %v) { %bc3 = bitcast <4 x i32> %permil1 to <2 x i64> ret <2 x i64> %bc3 } - -; Shuffle is much cheaper than fdiv. FMF are intersected. - -define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x float> %z) { -; CHECK-LABEL: define <4 x float> @shuf_fdiv_v4f32_yy( -; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Z]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = fdiv arcp <4 x float> [[TMP2]], [[TMP3]] -; CHECK-NEXT: ret <4 x float> [[R]] -; - %b0 = fdiv fast <4 x float> %x, %y - %b1 = fdiv arcp <4 x float> %z, %y - %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> - ret <4 x float> %r -} - -; Common operand is op0 of the binops. - -define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: define <4 x i32> @shuf_add_v4i32_xx( -; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> -; CHECK-NEXT: [[R2:%.*]] = add <4 x i32> [[TMP1]], [[R1]] -; CHECK-NEXT: ret <4 x i32> [[R2]] -; - %b0 = add <4 x i32> %x, %y - %b1 = add <4 x i32> %x, %z - %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> - ret <4 x i32> %r -} - -; For commutative instructions, common operand may be swapped. 
- -define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) { -; CHECK-LABEL: define <4 x float> @shuf_fmul_v4f32_xx_swap( -; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x float> [[R]] -; - %b0 = fmul <4 x float> %x, %y - %b1 = fmul <4 x float> %z, %x - %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> - ret <4 x float> %r -} - -; For commutative instructions, common operand may be swapped. - -define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { -; CHECK-LABEL: define <2 x i64> @shuf_and_v2i64_yy_swap( -; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> -; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <2 x i64> [[R]] -; - %b0 = and <2 x i64> %x, %y - %b1 = and <2 x i64> %y, %z - %r = shufflevector <2 x i64> %b0, <2 x i64> %b1, <2 x i32> - ret <2 x i64> %r -} - -; non-commutative binop, but common op0 - -define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx( -; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> -; CHECK-NEXT: [[R:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i32> [[R]] -; - %b0 = shl <4 x i32> %x, %y - %b1 = shl <4 x i32> %x, %z - %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> - ret <4 x i32> %r -} - -; negative test - common operand, but not commutable - -define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap( -; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X]], [[Y]] -; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[Z]], [[X]] -; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R1]] -; - %b0 = shl <4 x i32> %x, %y - %b1 = shl <4 x i32> %z, %x - %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> - ret <4 x i32> %r -} - -; negative test - mismatched opcodes - -define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { -; CHECK-LABEL: define <2 x i64> @shuf_sub_add_v2i64_yy( -; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[B0:%.*]] = sub <2 x i64> [[X]], [[Y]] -; CHECK-NEXT: [[B1:%.*]] = add <2 x i64> [[Z]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[B0]], <2 x i64> [[B1]], <2 x i32> -; CHECK-NEXT: ret <2 x i64> [[R]] -; - %b0 = sub <2 x i64> %x, %y - %b1 = add <2 x i64> %z, %y - %r = shufflevector <2 x i64> %b0, <2 x i64> %b1, <2 x i32> - ret <2 x i64> %r -} - -; negative test - type change via shuffle - -define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x 
float> %z) { -; CHECK-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type( -; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]] -; CHECK-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R]] -; - %b0 = fmul <4 x float> %x, %y - %b1 = fmul <4 x float> %z, %x - %r = shufflevector <4 x float> %b0, <4 x float> %b1, <8 x i32> - ret <8 x float> %r -} - -; negative test - uses - -define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: define <4 x i32> @shuf_lshr_v4i32_yy_use1( -; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[B0:%.*]] = lshr <4 x i32> [[X]], [[Y]] -; CHECK-NEXT: call void @use(<4 x i32> [[B0]]) -; CHECK-NEXT: [[B1:%.*]] = lshr <4 x i32> [[Z]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R]] -; - %b0 = lshr <4 x i32> %x, %y - call void @use(<4 x i32> %b0) - %b1 = lshr <4 x i32> %z, %y - %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> - ret <4 x i32> %r -} - -; negative test - uses - -define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: define <4 x i32> @shuf_mul_v4i32_yy_use2( -; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[B0:%.*]] = mul <4 x i32> [[X]], [[Y]] -; CHECK-NEXT: [[B1:%.*]] = mul <4 x i32> [[Z]], [[Y]] -; CHECK-NEXT: call void @use(<4 x i32> [[B1]]) -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R]] -; - %b0 = mul <4 x i32> %x, %y - %b1 = mul <4 x i32> %z, %y - call void @use(<4 x i32> %b1) - %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> - ret <4 x i32> %r -} - -; negative test - must have matching operand - -define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) { -; CHECK-LABEL: define <4 x float> @shuf_fadd_v4f32_no_common_op( -; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[B0:%.*]] = fadd <4 x float> [[X]], [[Y]] -; CHECK-NEXT: [[B1:%.*]] = fadd <4 x float> [[Z]], [[W]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[R]] -; - %b0 = fadd <4 x float> %x, %y - %b1 = fadd <4 x float> %z, %w - %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> - ret <4 x float> %r -} - -; negative test - binops may be relatively cheap - -define <16 x i16> @shuf_and_v16i16_yy_expensive_shuf(<16 x i16> %x, <16 x i16> %y, <16 x i16> %z) { -; CHECK-LABEL: define <16 x i16> @shuf_and_v16i16_yy_expensive_shuf( -; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]], <16 x i16> [[Z:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[B0:%.*]] = and <16 x i16> [[X]], [[Y]] -; CHECK-NEXT: [[B1:%.*]] = and <16 x i16> [[Y]], [[Z]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <16 x i16> [[B0]], <16 x i16> [[B1]], <16 x i32> -; CHECK-NEXT: ret <16 x i16> [[R]] -; - %b0 = and <16 x i16> %x, %y - %b1 = and <16 x i16> %y, %z - %r = shufflevector <16 x i16> %b0, <16 x i16> %b1, <16 x i32> - ret <16 x i16> %r -} From 6133878227efc30355c02c2f089e06ce58231a3d Mon Sep 17 00:00:00 2001 From: 
Erich Keane Date: Tue, 16 Apr 2024 06:57:36 -0700 Subject: [PATCH 105/300] [OpenACC] Implement `self` clause for compute constructs (#88760) `self` clauses on compute constructs take an optional condition expression. We again limit the implementation to ONLY compute constructs to ensure we get all the rules correct for others. However, this one will be particularly complicated, as it takes a `var-list` for `update`, so when we get to that construct/clause combination, we need to do that as well. This patch also furthers uses of the `OpenACCClauses.def` as it became useful while implementing this (as well as some other minor refactors as I went through). Finally, `self` and `if` clauses have an interaction with each other, if an `if` clause evaluates to `true`, the `self` clause has no effect. While this is intended and can be used 'meaningfully', we are warning on this with a very granular warning, so that this edge case will be noticed by newer users, but can be disabled trivially. --- clang/include/clang/AST/OpenACCClause.h | 65 ++++-------- clang/include/clang/AST/StmtOpenACC.h | 4 +- .../clang/Basic/DiagnosticSemaKinds.td | 4 + clang/include/clang/Basic/OpenACCClauses.def | 1 + clang/include/clang/Basic/OpenACCKinds.h | 6 ++ clang/include/clang/Sema/SemaOpenACC.h | 18 +++- clang/lib/AST/OpenACCClause.cpp | 26 +++++ clang/lib/AST/StmtProfile.cpp | 5 + clang/lib/AST/TextNodeDumper.cpp | 1 + clang/lib/Parse/ParseOpenACC.cpp | 16 ++- clang/lib/Sema/SemaOpenACC.cpp | 68 ++++++++++-- clang/lib/Sema/TreeTransform.h | 100 +++++++++++++----- clang/lib/Serialization/ASTReader.cpp | 7 +- clang/lib/Serialization/ASTWriter.cpp | 9 +- clang/test/ParserOpenACC/parse-clauses.c | 5 +- .../compute-construct-clause-ast.cpp | 91 ++++++++++++++++ .../compute-construct-self-clause.c | 82 ++++++++++++++ .../compute-construct-self-clause.cpp | 99 +++++++++++++++++ clang/tools/libclang/CIndex.cpp | 4 + 19 files changed, 513 insertions(+), 98 deletions(-) create mode 100644 clang/test/SemaOpenACC/compute-construct-self-clause.c create mode 100644 clang/test/SemaOpenACC/compute-construct-self-clause.cpp diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index 401b8e904a1b7a..07587849eb1219 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -145,6 +145,17 @@ class OpenACCIfClause : public OpenACCClauseWithCondition { SourceLocation EndLoc); }; +/// A 'self' clause, which has an optional condition expression. 
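+/// As a rough sketch (illustrative usage only, not part of this patch),
+/// the source forms this node models are:
+/// \code
+///   #pragma acc parallel self          // no condition expression
+///   #pragma acc serial self(i > j)     // with condition expression
+/// \endcode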
+class OpenACCSelfClause : public OpenACCClauseWithCondition { + OpenACCSelfClause(SourceLocation BeginLoc, SourceLocation LParenLoc, + Expr *ConditionExpr, SourceLocation EndLoc); + +public: + static OpenACCSelfClause *Create(const ASTContext &C, SourceLocation BeginLoc, + SourceLocation LParenLoc, + Expr *ConditionExpr, SourceLocation EndLoc); +}; + template class OpenACCClauseVisitor { Impl &getDerived() { return static_cast(*this); } @@ -159,53 +170,13 @@ template class OpenACCClauseVisitor { return; switch (C->getClauseKind()) { - case OpenACCClauseKind::Default: - VisitDefaultClause(*cast(C)); - return; - case OpenACCClauseKind::If: - VisitIfClause(*cast(C)); - return; - case OpenACCClauseKind::Finalize: - case OpenACCClauseKind::IfPresent: - case OpenACCClauseKind::Seq: - case OpenACCClauseKind::Independent: - case OpenACCClauseKind::Auto: - case OpenACCClauseKind::Worker: - case OpenACCClauseKind::Vector: - case OpenACCClauseKind::NoHost: - case OpenACCClauseKind::Self: - case OpenACCClauseKind::Copy: - case OpenACCClauseKind::UseDevice: - case OpenACCClauseKind::Attach: - case OpenACCClauseKind::Delete: - case OpenACCClauseKind::Detach: - case OpenACCClauseKind::Device: - case OpenACCClauseKind::DevicePtr: - case OpenACCClauseKind::DeviceResident: - case OpenACCClauseKind::FirstPrivate: - case OpenACCClauseKind::Host: - case OpenACCClauseKind::Link: - case OpenACCClauseKind::NoCreate: - case OpenACCClauseKind::Present: - case OpenACCClauseKind::Private: - case OpenACCClauseKind::CopyOut: - case OpenACCClauseKind::CopyIn: - case OpenACCClauseKind::Create: - case OpenACCClauseKind::Reduction: - case OpenACCClauseKind::Collapse: - case OpenACCClauseKind::Bind: - case OpenACCClauseKind::VectorLength: - case OpenACCClauseKind::NumGangs: - case OpenACCClauseKind::NumWorkers: - case OpenACCClauseKind::DeviceNum: - case OpenACCClauseKind::DefaultAsync: - case OpenACCClauseKind::DeviceType: - case OpenACCClauseKind::DType: - case OpenACCClauseKind::Async: - case OpenACCClauseKind::Tile: - case OpenACCClauseKind::Gang: - case OpenACCClauseKind::Wait: - case OpenACCClauseKind::Invalid: +#define VISIT_CLAUSE(CLAUSE_NAME) \ + case OpenACCClauseKind::CLAUSE_NAME: \ + Visit##CLAUSE_NAME##Clause(*cast(C)); \ + return; +#include "clang/Basic/OpenACCClauses.def" + + default: llvm_unreachable("Clause visitor not yet implemented"); } llvm_unreachable("Invalid Clause kind"); diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h index 419cb6cada0bc7..66f8f844e0b29e 100644 --- a/clang/include/clang/AST/StmtOpenACC.h +++ b/clang/include/clang/AST/StmtOpenACC.h @@ -142,9 +142,7 @@ class OpenACCComputeConstruct final Stmt *StructuredBlock) : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass, K, Start, End, StructuredBlock) { - assert((K == OpenACCDirectiveKind::Parallel || - K == OpenACCDirectiveKind::Serial || - K == OpenACCDirectiveKind::Kernels) && + assert(isOpenACCComputeDirectiveKind(K) && "Only parallel, serial, and kernels constructs should be " "represented by this type"); diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5ec0218aedfe86..44f802c0c28e84 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12274,4 +12274,8 @@ def note_acc_branch_into_compute_construct : Note<"invalid branch into OpenACC Compute Construct">; def note_acc_branch_out_of_compute_construct : Note<"invalid branch out of OpenACC Compute 
Construct">; +def warn_acc_if_self_conflict + : Warning<"OpenACC construct 'self' has no effect when an 'if' clause " + "evaluates to true">, + InGroup>; } // end of sema component. diff --git a/clang/include/clang/Basic/OpenACCClauses.def b/clang/include/clang/Basic/OpenACCClauses.def index 7fd2720e02ce22..378495d2c0909a 100644 --- a/clang/include/clang/Basic/OpenACCClauses.def +++ b/clang/include/clang/Basic/OpenACCClauses.def @@ -17,5 +17,6 @@ VISIT_CLAUSE(Default) VISIT_CLAUSE(If) +VISIT_CLAUSE(Self) #undef VISIT_CLAUSE diff --git a/clang/include/clang/Basic/OpenACCKinds.h b/clang/include/clang/Basic/OpenACCKinds.h index 3414df99991701..e3f74178433285 100644 --- a/clang/include/clang/Basic/OpenACCKinds.h +++ b/clang/include/clang/Basic/OpenACCKinds.h @@ -146,6 +146,12 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &Out, return printOpenACCDirectiveKind(Out, K); } +inline bool isOpenACCComputeDirectiveKind(OpenACCDirectiveKind K) { + return K == OpenACCDirectiveKind::Parallel || + K == OpenACCDirectiveKind::Serial || + K == OpenACCDirectiveKind::Kernels; +} + enum class OpenACCAtomicKind { Read, Write, diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h index c1fe0f5b9c0f6b..329dc3945fa2a6 100644 --- a/clang/include/clang/Sema/SemaOpenACC.h +++ b/clang/include/clang/Sema/SemaOpenACC.h @@ -44,7 +44,8 @@ class SemaOpenACC : public SemaBase { Expr *ConditionExpr; }; - std::variant Details; + std::variant Details = + std::monostate{}; public: OpenACCParsedClause(OpenACCDirectiveKind DirKind, @@ -72,8 +73,17 @@ class SemaOpenACC : public SemaBase { } Expr *getConditionExpr() { - assert(ClauseKind == OpenACCClauseKind::If && + assert((ClauseKind == OpenACCClauseKind::If || + (ClauseKind == OpenACCClauseKind::Self && + DirKind != OpenACCDirectiveKind::Update)) && "Parsed clause kind does not have a condition expr"); + + // 'self' has an optional ConditionExpr, so be tolerant of that. This will + // assert in variant otherwise. + if (ClauseKind == OpenACCClauseKind::Self && + std::holds_alternative(Details)) + return nullptr; + return std::get(Details).ConditionExpr; } @@ -87,7 +97,9 @@ class SemaOpenACC : public SemaBase { } void setConditionDetails(Expr *ConditionExpr) { - assert(ClauseKind == OpenACCClauseKind::If && + assert((ClauseKind == OpenACCClauseKind::If || + (ClauseKind == OpenACCClauseKind::Self && + DirKind != OpenACCDirectiveKind::Update)) && "Parsed clause kind does not have a condition expr"); // In C++ we can count on this being a 'bool', but in C this gets left as // some sort of scalar that codegen will have to take care of converting. 
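   // For example, in C a condition such as 'self(i > j)' has type 'int',
   // while in C++ the same expression is contextually converted to 'bool'.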
diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp index dcb512cb514179..9c259c8f9bd0a1 100644 --- a/clang/lib/AST/OpenACCClause.cpp +++ b/clang/lib/AST/OpenACCClause.cpp @@ -48,6 +48,26 @@ OpenACCIfClause::OpenACCIfClause(SourceLocation BeginLoc, "Condition expression type not scalar/dependent"); } +OpenACCSelfClause *OpenACCSelfClause::Create(const ASTContext &C, + SourceLocation BeginLoc, + SourceLocation LParenLoc, + Expr *ConditionExpr, + SourceLocation EndLoc) { + void *Mem = C.Allocate(sizeof(OpenACCIfClause), alignof(OpenACCIfClause)); + return new (Mem) + OpenACCSelfClause(BeginLoc, LParenLoc, ConditionExpr, EndLoc); +} + +OpenACCSelfClause::OpenACCSelfClause(SourceLocation BeginLoc, + SourceLocation LParenLoc, + Expr *ConditionExpr, SourceLocation EndLoc) + : OpenACCClauseWithCondition(OpenACCClauseKind::Self, BeginLoc, LParenLoc, + ConditionExpr, EndLoc) { + assert((!ConditionExpr || ConditionExpr->isInstantiationDependent() || + ConditionExpr->getType()->isScalarType()) && + "Condition expression type not scalar/dependent"); +} + OpenACCClause::child_range OpenACCClause::children() { switch (getClauseKind()) { default: @@ -72,3 +92,9 @@ void OpenACCClausePrinter::VisitDefaultClause(const OpenACCDefaultClause &C) { void OpenACCClausePrinter::VisitIfClause(const OpenACCIfClause &C) { OS << "if(" << C.getConditionExpr() << ")"; } + +void OpenACCClausePrinter::VisitSelfClause(const OpenACCSelfClause &C) { + OS << "self"; + if (const Expr *CondExpr = C.getConditionExpr()) + OS << "(" << CondExpr << ")"; +} diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 789e4634bd293b..b26d804c6f079b 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2491,6 +2491,11 @@ void OpenACCClauseProfiler::VisitIfClause(const OpenACCIfClause &Clause) { "if clause requires a valid condition expr"); Profiler.VisitStmt(Clause.getConditionExpr()); } + +void OpenACCClauseProfiler::VisitSelfClause(const OpenACCSelfClause &Clause) { + if (Clause.hasConditionExpr()) + Profiler.VisitStmt(Clause.getConditionExpr()); +} } // namespace void StmtProfiler::VisitOpenACCComputeConstruct( diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 688daa64d61974..ff5b3df2d6dfac 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -398,6 +398,7 @@ void TextNodeDumper::Visit(const OpenACCClause *C) { OS << '(' << cast(C)->getDefaultClauseKind() << ')'; break; case OpenACCClauseKind::If: + case OpenACCClauseKind::Self: // The condition expression will be printed as a part of the 'children', // but print 'clause' here so it is clear what is happening from the dump. 
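    // For example, a 'self' clause dumps as "self clause", with its optional
    // condition expression (if present) appearing among the child nodes.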
OS << " clause"; diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 91f2b8afcf0c24..123be476e928ee 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -835,19 +835,23 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams( case OpenACCClauseKind::Default: { Token DefKindTok = getCurToken(); - if (expectIdentifierOrKeyword(*this)) - break; + if (expectIdentifierOrKeyword(*this)) { + Parens.skipToEnd(); + return OpenACCCanContinue(); + } ConsumeToken(); OpenACCDefaultClauseKind DefKind = getOpenACCDefaultClauseKind(DefKindTok); - if (DefKind == OpenACCDefaultClauseKind::Invalid) + if (DefKind == OpenACCDefaultClauseKind::Invalid) { Diag(DefKindTok, diag::err_acc_invalid_default_clause_kind); - else - ParsedClause.setDefaultDetails(DefKind); + Parens.skipToEnd(); + return OpenACCCanContinue(); + } + ParsedClause.setDefaultDetails(DefKind); break; } case OpenACCClauseKind::If: { @@ -977,6 +981,8 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams( case OpenACCClauseKind::Self: { assert(DirKind != OpenACCDirectiveKind::Update); ExprResult CondExpr = ParseOpenACCConditionExpr(); + ParsedClause.setConditionDetails(CondExpr.isUsable() ? CondExpr.get() + : nullptr); if (CondExpr.isInvalid()) { Parens.skipToEnd(); diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 1249136c87650b..59f65eaf47a6da 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -15,6 +15,7 @@ #include "clang/AST/StmtOpenACC.h" #include "clang/Basic/DiagnosticSema.h" #include "clang/Sema/Sema.h" +#include "llvm/Support/Casting.h" using namespace clang; @@ -76,6 +77,19 @@ bool doesClauseApplyToDirective(OpenACCDirectiveKind DirectiveKind, default: return false; } + case OpenACCClauseKind::Self: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Update: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } default: // Do nothing so we can go to the 'unimplemented' diagnostic instead. return true; @@ -121,9 +135,7 @@ SemaOpenACC::ActOnClause(ArrayRef ExistingClauses, // Restrictions only properly implemented on 'compute' constructs, and // 'compute' constructs are the only construct that can do anything with // this yet, so skip/treat as unimplemented in this case. - if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Parallel && - Clause.getDirectiveKind() != OpenACCDirectiveKind::Serial && - Clause.getDirectiveKind() != OpenACCDirectiveKind::Kernels) + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) break; // Don't add an invalid clause to the AST. @@ -146,9 +158,7 @@ SemaOpenACC::ActOnClause(ArrayRef ExistingClauses, // Restrictions only properly implemented on 'compute' constructs, and // 'compute' constructs are the only construct that can do anything with // this yet, so skip/treat as unimplemented in this case. 
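    // (The compute constructs are exactly 'parallel', 'serial', and
    // 'kernels', the set the new isOpenACCComputeDirectiveKind helper tests.)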
- if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Parallel && - Clause.getDirectiveKind() != OpenACCDirectiveKind::Serial && - Clause.getDirectiveKind() != OpenACCDirectiveKind::Kernels) + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) break; // There is no prose in the standard that says duplicates aren't allowed, @@ -160,12 +170,54 @@ SemaOpenACC::ActOnClause(ArrayRef ExistingClauses, // The parser has ensured that we have a proper condition expr, so there // isn't really much to do here. - // TODO OpenACC: When we implement 'self', this clauses causes us to - // 'ignore' the self clause, so we should implement a warning here. + // If the 'if' clause is true, it makes the 'self' clause have no effect, + // diagnose that here. + // TODO OpenACC: When we add these two to other constructs, we might not + // want to warn on this (for example, 'update'). + const auto *Itr = + llvm::find_if(ExistingClauses, llvm::IsaPred); + if (Itr != ExistingClauses.end()) { + Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); + Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + } + return OpenACCIfClause::Create( getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getConditionExpr(), Clause.getEndLoc()); } + + case OpenACCClauseKind::Self: { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + break; + + // TODO OpenACC: When we implement this for 'update', this takes a + // 'var-list' instead of a condition expression, so semantics/handling has + // to happen differently here. + + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. + if (checkAlreadyHasClauseOfKind(*this, ExistingClauses, Clause)) + return nullptr; + + // If the 'if' clause is true, it makes the 'self' clause have no effect, + // diagnose that here. + // TODO OpenACC: When we add these two to other constructs, we might not + // want to warn on this (for example, 'update'). 
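+    // For example, '#pragma acc parallel if(x) self(y)' warns here, since
+    // the 'self' clause has no effect whenever 'x' evaluates to true.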
+ const auto *Itr = + llvm::find_if(ExistingClauses, llvm::IsaPred); + if (Itr != ExistingClauses.end()) { + Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); + Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + } + + return OpenACCSelfClause::Create( + getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.getConditionExpr(), Clause.getEndLoc()); + } default: break; } diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index acc2d7ff9d427c..0c7fdb357235e1 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -11088,6 +11088,77 @@ OMPClause *TreeTransform::TransformOMPXBareClause(OMPXBareClause *C) { //===----------------------------------------------------------------------===// // OpenACC transformation //===----------------------------------------------------------------------===// +namespace { +template +class OpenACCClauseTransform final + : public OpenACCClauseVisitor> { + TreeTransform &Self; + SemaOpenACC::OpenACCParsedClause &ParsedClause; + OpenACCClause *NewClause = nullptr; + +public: + OpenACCClauseTransform(TreeTransform &Self, + SemaOpenACC::OpenACCParsedClause &PC) + : Self(Self), ParsedClause(PC) {} + + OpenACCClause *CreatedClause() const { return NewClause; } + +#define VISIT_CLAUSE(CLAUSE_NAME) \ + void Visit##CLAUSE_NAME##Clause(const OpenACC##CLAUSE_NAME##Clause &Clause); +#include "clang/Basic/OpenACCClauses.def" +}; + +template +void OpenACCClauseTransform::VisitDefaultClause( + const OpenACCDefaultClause &C) { + ParsedClause.setDefaultDetails(C.getDefaultClauseKind()); + + NewClause = OpenACCDefaultClause::Create( + Self.getSema().getASTContext(), ParsedClause.getDefaultClauseKind(), + ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), + ParsedClause.getEndLoc()); +} + +template +void OpenACCClauseTransform::VisitIfClause(const OpenACCIfClause &C) { + Expr *Cond = const_cast(C.getConditionExpr()); + assert(Cond && "If constructed with invalid Condition"); + Sema::ConditionResult Res = Self.TransformCondition( + Cond->getExprLoc(), /*Var=*/nullptr, Cond, Sema::ConditionKind::Boolean); + + if (Res.isInvalid() || !Res.get().second) + return; + + ParsedClause.setConditionDetails(Res.get().second); + + NewClause = OpenACCIfClause::Create( + Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), + ParsedClause.getLParenLoc(), ParsedClause.getConditionExpr(), + ParsedClause.getEndLoc()); +} + +template +void OpenACCClauseTransform::VisitSelfClause( + const OpenACCSelfClause &C) { + + if (C.hasConditionExpr()) { + Expr *Cond = const_cast(C.getConditionExpr()); + Sema::ConditionResult Res = + Self.TransformCondition(Cond->getExprLoc(), /*Var=*/nullptr, Cond, + Sema::ConditionKind::Boolean); + + if (Res.isInvalid() || !Res.get().second) + return; + + ParsedClause.setConditionDetails(Res.get().second); + } + + NewClause = OpenACCSelfClause::Create( + Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), + ParsedClause.getLParenLoc(), ParsedClause.getConditionExpr(), + ParsedClause.getEndLoc()); +} +} // namespace template OpenACCClause *TreeTransform::TransformOpenACCClause( ArrayRef ExistingClauses, @@ -11100,33 +11171,10 @@ OpenACCClause *TreeTransform::TransformOpenACCClause( if (const auto *WithParms = dyn_cast(OldClause)) ParsedClause.setLParenLoc(WithParms->getLParenLoc()); - switch (OldClause->getClauseKind()) { - case OpenACCClauseKind::Default: - // There is nothing to do here as nothing dependent can appear in this - // clause. 
So just set the values so Sema can set the right value. - ParsedClause.setDefaultDetails( - cast(OldClause)->getDefaultClauseKind()); - break; - case OpenACCClauseKind::If: { - Expr *Cond = const_cast( - cast(OldClause)->getConditionExpr()); - assert(Cond && "If constructed with invalid Condition"); - Sema::ConditionResult Res = - TransformCondition(Cond->getExprLoc(), /*Var=*/nullptr, Cond, - Sema::ConditionKind::Boolean); - - if (Res.isInvalid() || !Res.get().second) - return nullptr; - - ParsedClause.setConditionDetails(Res.get().second); - break; - } - default: - assert(false && "Unhandled OpenACC clause in TreeTransform"); - return nullptr; - } + OpenACCClauseTransform Transform{*this, ParsedClause}; + Transform.Visit(OldClause); - return getSema().OpenACC().ActOnClause(ExistingClauses, ParsedClause); + return Transform.CreatedClause(); } template diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index f47d540ea4b86d..cf0726460bfca7 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -11781,6 +11781,12 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { return OpenACCIfClause::Create(getContext(), BeginLoc, LParenLoc, CondExpr, EndLoc); } + case OpenACCClauseKind::Self: { + SourceLocation LParenLoc = readSourceLocation(); + Expr *CondExpr = readBool() ? readSubExpr() : nullptr; + return OpenACCSelfClause::Create(getContext(), BeginLoc, LParenLoc, + CondExpr, EndLoc); + } case OpenACCClauseKind::Finalize: case OpenACCClauseKind::IfPresent: case OpenACCClauseKind::Seq: @@ -11789,7 +11795,6 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { case OpenACCClauseKind::Worker: case OpenACCClauseKind::Vector: case OpenACCClauseKind::NoHost: - case OpenACCClauseKind::Self: case OpenACCClauseKind::Copy: case OpenACCClauseKind::UseDevice: case OpenACCClauseKind::Attach: diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index ce6fa1feb1eeb3..b2a078b6d80f46 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7524,6 +7524,14 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { AddStmt(const_cast(IC->getConditionExpr())); return; } + case OpenACCClauseKind::Self: { + const auto *SC = cast(C); + writeSourceLocation(SC->getLParenLoc()); + writeBool(SC->hasConditionExpr()); + if (SC->hasConditionExpr()) + AddStmt(const_cast(SC->getConditionExpr())); + return; + } case OpenACCClauseKind::Finalize: case OpenACCClauseKind::IfPresent: case OpenACCClauseKind::Seq: @@ -7532,7 +7540,6 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::Worker: case OpenACCClauseKind::Vector: case OpenACCClauseKind::NoHost: - case OpenACCClauseKind::Self: case OpenACCClauseKind::Copy: case OpenACCClauseKind::UseDevice: case OpenACCClauseKind::Attach: diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c index 2369df58308a72..4462f0df540f2d 100644 --- a/clang/test/ParserOpenACC/parse-clauses.c +++ b/clang/test/ParserOpenACC/parse-clauses.c @@ -376,16 +376,13 @@ void SelfClause() { #pragma acc serial self(i > j, seq for(;;){} - // expected-warning@+2{{left operand of comma operator has no effect}} - // expected-warning@+1{{OpenACC clause 'self' not yet implemented, clause ignored}} + // expected-warning@+1{{left operand of comma operator has no effect}} #pragma acc serial self(i, j) for(;;){} - // expected-warning@+1{{OpenACC clause 'self' not yet 
implemented, clause ignored}} #pragma acc serial self(i > j) for(;;){} - // expected-warning@+2{{OpenACC clause 'self' not yet implemented, clause ignored}} // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}} #pragma acc serial self(1+5>3), seq for(;;){} diff --git a/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp b/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp index 018f0b68c78109..6d2efcf81eb6e4 100644 --- a/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp +++ b/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp @@ -110,6 +110,50 @@ void TemplFunc() { // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt +#pragma acc serial self + while(true); + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial + // CHECK-NEXT: self clause + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + +#pragma acc kernels self(T::SomeFloat) + while(true); + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}kernels + // CHECK-NEXT: self clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + +#pragma acc parallel self(T::SomeFloat) if (T::SomeFloat) + while(true); + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel + // CHECK-NEXT: self clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: if clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + +#pragma acc serial if(T::SomeFloat) self(T::SomeFloat) + while(true); + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial + // CHECK-NEXT: if clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: self clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + // Match the instantiation: // CHECK: FunctionDecl{{.*}}TemplFunc{{.*}}implicit_instantiation // CHECK-NEXT: TemplateArgument type 'InstTy' @@ -171,6 +215,53 @@ void TemplFunc() { // CHECK-NEXT: WhileStmt // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial + // CHECK-NEXT: self clause + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}kernels + // CHECK-NEXT: self clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel + // CHECK-NEXT: self clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: if clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const 
float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial + // CHECK-NEXT: if clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: self clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + } struct BoolConversion{ operator bool() const;}; diff --git a/clang/test/SemaOpenACC/compute-construct-self-clause.c b/clang/test/SemaOpenACC/compute-construct-self-clause.c new file mode 100644 index 00000000000000..fbed2953419a2e --- /dev/null +++ b/clang/test/SemaOpenACC/compute-construct-self-clause.c @@ -0,0 +1,82 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +void BoolExpr(int *I, float *F) { + typedef struct {} SomeStruct; + struct C{}; + // expected-error@+1{{expected expression}} +#pragma acc parallel self (struct C f()) + while(0); + + // expected-error@+1{{unexpected type name 'SomeStruct': expected expression}} +#pragma acc serial self (SomeStruct) + while(0); + + // expected-error@+1{{unexpected type name 'SomeStruct': expected expression}} +#pragma acc serial self (SomeStruct()) + while(0); + + SomeStruct S; + // expected-error@+1{{statement requires expression of scalar type ('SomeStruct' invalid)}} +#pragma acc serial self (S) + while(0); + +#pragma acc parallel self (I) + while(0); + +#pragma acc serial self (F) + while(0); + +#pragma acc kernels self (*I < *F) + while(0); +} + +void WarnMaybeNotUsed(int val1, int val2) { + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self if(val1) + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self(val1) if(val1) + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(val1) self + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(val1) self(val2) + while(0); + + // The below don't warn because one side or the other has an error, thus is + // not added to the AST. 
+ + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel self if(invalid) + while(0); + + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel self(invalid) if(val1) + while(0); + + // expected-error@+2{{expected expression}} + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel self() if(invalid) + while(0); + + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel if(invalid) self + while(0); + + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel if(val2) self(invalid) + while(0); + + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel if(invalid) self(val1) + while(0); +} diff --git a/clang/test/SemaOpenACC/compute-construct-self-clause.cpp b/clang/test/SemaOpenACC/compute-construct-self-clause.cpp new file mode 100644 index 00000000000000..60edbdc2b1191b --- /dev/null +++ b/clang/test/SemaOpenACC/compute-construct-self-clause.cpp @@ -0,0 +1,99 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +struct NoBoolConversion{}; +struct BoolConversion{ + operator bool(); +}; + +template +void BoolExpr() { + // expected-error@+1{{value of type 'NoBoolConversion' is not contextually convertible to 'bool'}} +#pragma acc parallel self (NoBoolConversion{}) + while(0); + // expected-error@+2{{no member named 'NotValid' in 'NoBoolConversion'}} + // expected-note@#INST{{in instantiation of function template specialization}} +#pragma acc parallel self (T::NotValid) + while(0); + +#pragma acc parallel self (BoolConversion{}) + while(0); + + // expected-error@+1{{value of type 'NoBoolConversion' is not contextually convertible to 'bool'}} +#pragma acc parallel self (T{}) + while(0); + +#pragma acc parallel self (U{}) + while(0); +} + +struct HasBool { + static constexpr bool B = true; +}; + +template +void WarnMaybeNotUsed() { + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self if(T::B) + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self(T::B) if(T::B) + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(T::B) self + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(T::B) self(T::B) + while(0); + + // We still warn in the cases of dependent failures, since the diagnostic + // happens immediately rather than during instantiation. 
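+  // For example, for 'self if(T::Invalid)' below, the 'self'/'if' warning
+  // fires when the template is parsed, while the 'no member named' error
+  // only appears once WarnMaybeNotUsed<HasBool> is instantiated.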
+ + // expected-error@+4{{no member named 'Invalid' in 'HasBool'}} + // expected-note@#NOT_USED_INST{{in instantiation of function template specialization 'WarnMaybeNotUsed' requested here}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self if(T::Invalid) + while(0); + + // expected-error@+3{{no member named 'Invalid' in 'HasBool'}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self(T::Invalid) if(T::B) + while(0); + + // expected-error@+3{{no member named 'Invalid' in 'HasBool'}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self(T::B) if(T::Invalid) + while(0); + + // expected-error@+3{{no member named 'Invalid' in 'HasBool'}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(T::Invalid) self + while(0); + + // expected-error@+3{{no member named 'Invalid' in 'HasBool'}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(T::Invalid) self(T::B) + while(0); + + // expected-error@+3{{no member named 'Invalid' in 'HasBool'}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(T::B) self(T::Invalid) + while(0); +} + +void Instantiate() { + BoolExpr(); // #INST + WarnMaybeNotUsed(); // #NOT_USED_INST +} diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index f304786ff9dffd..2ef599d2cd26fa 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2791,6 +2791,10 @@ void OpenACCClauseEnqueue::VisitDefaultClause(const OpenACCDefaultClause &C) {} void OpenACCClauseEnqueue::VisitIfClause(const OpenACCIfClause &C) { Visitor.AddStmt(C.getConditionExpr()); } +void OpenACCClauseEnqueue::VisitSelfClause(const OpenACCSelfClause &C) { + if (C.hasConditionExpr()) + Visitor.AddStmt(C.getConditionExpr()); +} } // namespace void EnqueueVisitor::EnqueueChildren(const OpenACCClause *C) { From ac791888bbbe58651e597cf7a4b2276424b77a92 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 16 Apr 2024 15:58:16 +0200 Subject: [PATCH 106/300] [bazel] Add missing dependency for 1c076b43c294c7d29d99dd50f6853b33a5b99789 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 653c4bd30600a2..03386549a01163 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -11631,6 +11631,7 @@ cc_library( ":DialectUtils", ":FuncDialect", ":IR", + ":InferTypeOpInterface", ":LinalgDialect", ":MathDialect", ":Pass", From a0f8191af3945482f0f7a7c8f030e8c519a795b7 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 16 Apr 2024 13:12:52 +0100 Subject: [PATCH 107/300] [libclc] Give built bytecode objects a .bc extension. 
NFC --- libclc/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 7528228b3b7f9b..302f559dba7a04 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -326,7 +326,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) # Generated files are given just as file names, which we must make # absolute to the binary directory. set( input_file ${CMAKE_CURRENT_BINARY_DIR}/${file} ) - set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${file}.o" ) + set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${file}.bc" ) else() # Other files are originally relative to each SOURCE file, which are # then make relative to the libclc root directory. We must normalize @@ -336,7 +336,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) get_filename_component( abs_path ${file} ABSOLUTE BASE_DIR ${PROJECT_SOURCE_DIR} ) file( RELATIVE_PATH root_rel_path ${PROJECT_SOURCE_DIR} ${abs_path} ) set( input_file ${PROJECT_SOURCE_DIR}/${file} ) - set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${root_rel_path}.o" ) + set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${root_rel_path}.bc" ) endif() get_filename_component( file_dir ${file} DIRECTORY ) From 3d118f92081ea0c7048749dc5d08c8e8217be4eb Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 16 Apr 2024 13:59:59 +0100 Subject: [PATCH 108/300] [libclc] Fix dependencies between targets We need file-level - not target-level - dependencies for these custom commands to re-trigger when their dependencies change. --- libclc/CMakeLists.txt | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 302f559dba7a04..ed2764847e709e 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -364,7 +364,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( spv_suffix ${arch_suffix}.spv ) add_custom_command( OUTPUT ${spv_suffix} COMMAND ${LLVM_SPIRV} ${spvflags} -o ${spv_suffix} ${builtins_link_lib} - DEPENDS ${builtins_link_lib_tgt} + DEPENDS ${builtins_link_lib} ) add_custom_target( "prepare-${spv_suffix}" ALL DEPENDS "${spv_suffix}" ) install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${spv_suffix} @@ -376,7 +376,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) add_custom_command( OUTPUT ${builtins_opt_lib_tgt}.bc COMMAND libclc::opt ${opt_flags} -o ${builtins_opt_lib_tgt}.bc ${builtins_link_lib} - DEPENDS libclc::opt ${builtins_link_lib_tgt} + DEPENDS libclc::opt ${builtins_link_lib} ) add_custom_target( ${builtins_opt_lib_tgt} ALL DEPENDS ${builtins_opt_lib_tgt}.bc @@ -385,12 +385,13 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) PROPERTIES TARGET_FILE ${builtins_opt_lib_tgt}.bc ) + set( builtins_opt_lib $ ) + # Add prepare target set( obj_suffix ${arch_suffix}.bc ) add_custom_command( OUTPUT ${obj_suffix} - COMMAND prepare_builtins -o ${obj_suffix} - $ - DEPENDS ${builtins_opt_lib_tgt} prepare_builtins ) + COMMAND prepare_builtins -o ${obj_suffix} ${builtins_opt_lib} + DEPENDS ${builtins_opt_lib} prepare_builtins ) add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${obj_suffix} ) # nvptx-- targets don't include workitem builtins From 60de56c743c414240b293a8b8ee10bc2129d7e10 Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Tue, 16 Apr 2024 15:21:09 +0100 Subject: [PATCH 109/300] [ValueTracking] Restore isKnownNonZero parameter order. (#88873) Prior to #85863, the required parameters of llvm::isKnownNonZero were Value and DataLayout. After, they are Value, Depth, and SimplifyQuery, where SimplifyQuery is implicitly constructible from DataLayout. 
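Schematically, for a typical caller (an illustrative snippet, not lifted
from any single call site in this patch):

    isKnownNonZero(V, DL);             // before #85863: DataLayout sufficed
    isKnownNonZero(V, /*Depth=*/0, Q); // after #85863: Depth spelled out first
    isKnownNonZero(V, Q);              // with this patch: Depth defaults to 0
    isKnownNonZero(V, Q, Depth);       // and trails when it is needed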
The change to move Depth before SimplifyQuery needed callers to be updated unnecessarily, and as commented in #85863, we actually want Depth to be after SimplifyQuery anyway so that it can be defaulted and the caller does not need to specify it. --- clang/lib/CodeGen/CGCall.cpp | 3 +- llvm/include/llvm/Analysis/ValueTracking.h | 2 +- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 3 +- llvm/lib/Analysis/InstructionSimplify.cpp | 25 ++--- llvm/lib/Analysis/LazyValueInfo.cpp | 5 +- llvm/lib/Analysis/Loads.cpp | 4 +- llvm/lib/Analysis/ScalarEvolution.cpp | 2 +- llvm/lib/Analysis/ValueTracking.cpp | 106 +++++++++--------- llvm/lib/CodeGen/CodeGenPrepare.cpp | 2 +- .../Transforms/IPO/AttributorAttributes.cpp | 2 +- llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 2 +- .../InstCombine/InstCombineAddSub.cpp | 2 +- .../InstCombine/InstCombineAndOrXor.cpp | 4 +- .../InstCombine/InstCombineCalls.cpp | 11 +- .../InstCombine/InstCombineCompares.cpp | 18 +-- .../Transforms/InstCombine/InstCombinePHI.cpp | 3 +- .../InstCombine/InstructionCombining.cpp | 2 +- .../Instrumentation/MemorySanitizer.cpp | 4 +- .../Utils/PromoteMemoryToRegister.cpp | 2 +- .../lib/Transforms/Utils/SimplifyLibCalls.cpp | 20 ++-- .../Transforms/Vectorize/VectorCombine.cpp | 2 +- llvm/unittests/Analysis/ValueTrackingTest.cpp | 13 +-- 22 files changed, 113 insertions(+), 124 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 7a0bc6fa77b889..3f5463a9a70e9d 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -4124,8 +4124,7 @@ static bool isProvablyNull(llvm::Value *addr) { } static bool isProvablyNonNull(Address Addr, CodeGenFunction &CGF) { - return llvm::isKnownNonZero(Addr.getBasePointer(), /*Depth=*/0, - CGF.CGM.getDataLayout()); + return llvm::isKnownNonZero(Addr.getBasePointer(), CGF.CGM.getDataLayout()); } /// Emit the actual writing-back of a writeback. diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 9db0894162afca..e1c41b3b55ccfb 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -124,7 +124,7 @@ bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI); /// specified, perform context-sensitive analysis and return true if the /// pointer couldn't possibly be null at the specified instruction. /// Supports values with integer or pointer type and vectors of integers. -bool isKnownNonZero(const Value *V, unsigned Depth, const SimplifyQuery &Q); +bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth = 0); /// Return true if the two given values are negation. /// Currently can recoginze Value pair: diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index b082dfe8fbd217..16ee2ca49d0ece 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1283,8 +1283,7 @@ AliasResult BasicAAResult::aliasGEP( // VarIndex = Scale*V. const VariableGEPIndex &Var = DecompGEP1.VarIndices[0]; if (Var.Val.TruncBits == 0 && - isKnownNonZero(Var.Val.V, /*Depth=*/0, - SimplifyQuery(DL, DT, &AC, Var.CxtI))) { + isKnownNonZero(Var.Val.V, SimplifyQuery(DL, DT, &AC, Var.CxtI))) { // Check if abs(V*Scale) >= abs(Scale) holds in the presence of // potentially wrapping math. 
auto MultiplyByScaleNoWrap = [](const VariableGEPIndex &Var) { diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 4e6e666922671d..8955de6375dec4 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -1586,10 +1586,10 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, if (match(UnsignedICmp, m_c_ICmp(UnsignedPred, m_Specific(Y), m_Specific(A)))) { if (UnsignedPred == ICmpInst::ICMP_UGE && IsAnd && - EqPred == ICmpInst::ICMP_NE && isKnownNonZero(B, /*Depth=*/0, Q)) + EqPred == ICmpInst::ICMP_NE && isKnownNonZero(B, Q)) return UnsignedICmp; if (UnsignedPred == ICmpInst::ICMP_ULT && !IsAnd && - EqPred == ICmpInst::ICMP_EQ && isKnownNonZero(B, /*Depth=*/0, Q)) + EqPred == ICmpInst::ICMP_EQ && isKnownNonZero(B, Q)) return UnsignedICmp; } } @@ -1607,13 +1607,13 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, // X > Y && Y == 0 --> Y == 0 iff X != 0 // X > Y || Y == 0 --> X > Y iff X != 0 if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ && - isKnownNonZero(X, /*Depth=*/0, Q)) + isKnownNonZero(X, Q)) return IsAnd ? ZeroICmp : UnsignedICmp; // X <= Y && Y != 0 --> X <= Y iff X != 0 // X <= Y || Y != 0 --> Y != 0 iff X != 0 if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE && - isKnownNonZero(X, /*Depth=*/0, Q)) + isKnownNonZero(X, Q)) return IsAnd ? UnsignedICmp : ZeroICmp; // The transforms below here are expected to be handled more generally with @@ -2817,10 +2817,9 @@ static Constant *computePointerICmp(CmpInst::Predicate Pred, Value *LHS, // the other operand can not be based on the alloc - if it were, then // the cmp itself would be a capture. Value *MI = nullptr; - if (isAllocLikeFn(LHS, TLI) && llvm::isKnownNonZero(RHS, /*Depth=*/0, Q)) + if (isAllocLikeFn(LHS, TLI) && llvm::isKnownNonZero(RHS, Q)) MI = LHS; - else if (isAllocLikeFn(RHS, TLI) && - llvm::isKnownNonZero(LHS, /*Depth=*/0, Q)) + else if (isAllocLikeFn(RHS, TLI) && llvm::isKnownNonZero(LHS, Q)) MI = RHS; if (MI) { // FIXME: This is incorrect, see PR54002. 
While we can assume that the @@ -2976,12 +2975,12 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, return getTrue(ITy); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULE: - if (isKnownNonZero(LHS, /*Depth=*/0, Q)) + if (isKnownNonZero(LHS, Q)) return getFalse(ITy); break; case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: - if (isKnownNonZero(LHS, /*Depth=*/0, Q)) + if (isKnownNonZero(LHS, Q)) return getTrue(ITy); break; case ICmpInst::ICMP_SLT: { @@ -2996,7 +2995,7 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, KnownBits LHSKnown = computeKnownBits(LHS, /* Depth */ 0, Q); if (LHSKnown.isNegative()) return getTrue(ITy); - if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, /*Depth=*/0, Q)) + if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, Q)) return getFalse(ITy); break; } @@ -3012,7 +3011,7 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, KnownBits LHSKnown = computeKnownBits(LHS, /* Depth */ 0, Q); if (LHSKnown.isNegative()) return getFalse(ITy); - if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, /*Depth=*/0, Q)) + if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, Q)) return getTrue(ITy); break; } @@ -3165,7 +3164,7 @@ static Value *simplifyICmpWithBinOpOnLHS(CmpInst::Predicate Pred, const APInt *C; if ((match(LBO, m_LShr(m_Specific(RHS), m_APInt(C))) && *C != 0) || (match(LBO, m_UDiv(m_Specific(RHS), m_APInt(C))) && *C != 1)) { - if (isKnownNonZero(RHS, /*Depth=*/0, Q)) { + if (isKnownNonZero(RHS, Q)) { switch (Pred) { default: break; @@ -3398,7 +3397,7 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, bool NUW = Q.IIQ.hasNoUnsignedWrap(LBO) && Q.IIQ.hasNoUnsignedWrap(RBO); bool NSW = Q.IIQ.hasNoSignedWrap(LBO) && Q.IIQ.hasNoSignedWrap(RBO); if (!NUW || (ICmpInst::isSigned(Pred) && !NSW) || - !isKnownNonZero(LBO->getOperand(0), /*Depth=*/0, Q)) + !isKnownNonZero(LBO->getOperand(0), Q)) break; if (Value *V = simplifyICmpInst(Pred, LBO->getOperand(1), RBO->getOperand(1), Q, MaxRecurse - 1)) diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 3223b0564e6c9d..6cded828c25f4a 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -645,7 +645,7 @@ LazyValueInfoImpl::solveBlockValueImpl(Value *Val, BasicBlock *BB) { // instruction is placed, even if it could legally be hoisted much higher. // That is unfortunate. 
PointerType *PT = dyn_cast(BBI->getType()); - if (PT && isKnownNonZero(BBI, /*Depth=*/0, DL)) + if (PT && isKnownNonZero(BBI, DL)) return ValueLatticeElement::getNot(ConstantPointerNull::get(PT)); if (BBI->getType()->isIntegerTy()) { @@ -1863,8 +1863,7 @@ LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C, Module *M = CxtI->getModule(); const DataLayout &DL = M->getDataLayout(); if (V->getType()->isPointerTy() && C->isNullValue() && - isKnownNonZero(V->stripPointerCastsSameRepresentation(), /*Depth=*/0, - DL)) { + isKnownNonZero(V->stripPointerCastsSameRepresentation(), DL)) { if (Pred == ICmpInst::ICMP_EQ) return LazyValueInfo::False; else if (Pred == ICmpInst::ICMP_NE) diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index b5403408cf2ab3..ac508e19c9e014 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -100,7 +100,7 @@ static bool isDereferenceableAndAlignedPointer( if (KnownDerefBytes.getBoolValue() && KnownDerefBytes.uge(Size) && !CheckForFreed) if (!CheckForNonNull || - isKnownNonZero(V, /*Depth=*/0, SimplifyQuery(DL, DT, AC, CtxI))) { + isKnownNonZero(V, SimplifyQuery(DL, DT, AC, CtxI))) { // As we recursed through GEPs to get here, we've incrementally checked // that each step advanced by a multiple of the alignment. If our base is // properly aligned, then the original offset accessed must also be. @@ -134,7 +134,7 @@ static bool isDereferenceableAndAlignedPointer( if (getObjectSize(V, ObjSize, DL, TLI, Opts)) { APInt KnownDerefBytes(Size.getBitWidth(), ObjSize); if (KnownDerefBytes.getBoolValue() && KnownDerefBytes.uge(Size) && - isKnownNonZero(V, /*Depth=*/0, SimplifyQuery(DL, DT, AC, CtxI)) && + isKnownNonZero(V, SimplifyQuery(DL, DT, AC, CtxI)) && !V->canBeFreed()) { // As we recursed through GEPs to get here, we've incrementally // checked that each step advanced by a multiple of the alignment. If diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 1c98b0295e5253..95440dda3b4c0e 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -6900,7 +6900,7 @@ const ConstantRange &ScalarEvolution::getRangeRef( uint64_t Rem = MaxVal.urem(Align); MaxVal -= APInt(BitWidth, Rem); APInt MinVal = APInt::getZero(BitWidth); - if (llvm::isKnownNonZero(V, /*Depth=*/0, DL)) + if (llvm::isKnownNonZero(V, DL)) MinVal = Align; ConservativeResult = ConservativeResult.intersectWith( ConstantRange::getNonEmpty(MinVal, MaxVal + 1), RangeType); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index b3abf016cfb93d..e91dc07f31641b 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -272,7 +272,7 @@ bool llvm::isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, } static bool isKnownNonZero(const Value *V, const APInt &DemandedElts, - unsigned Depth, const SimplifyQuery &Q); + const SimplifyQuery &Q, unsigned Depth); bool llvm::isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth) { @@ -288,7 +288,7 @@ bool llvm::isKnownPositive(const Value *V, const SimplifyQuery &SQ, // this updated. 
KnownBits Known = computeKnownBits(V, Depth, SQ); return Known.isNonNegative() && - (Known.isNonZero() || isKnownNonZero(V, Depth, SQ)); + (Known.isNonZero() || isKnownNonZero(V, SQ, Depth)); } bool llvm::isKnownNegative(const Value *V, const SimplifyQuery &SQ, @@ -868,7 +868,7 @@ static void computeKnownBitsFromShiftOperator( bool ShAmtNonZero = Known.isNonZero() || (Known.getMaxValue().ult(Known.getBitWidth()) && - isKnownNonZero(I->getOperand(1), DemandedElts, Depth + 1, Q)); + isKnownNonZero(I->getOperand(1), DemandedElts, Q, Depth + 1)); Known = KF(Known2, Known, ShAmtNonZero); } @@ -2124,7 +2124,7 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, case Instruction::Mul: return isKnownToBeAPowerOfTwo(I->getOperand(1), OrZero, Depth, Q) && isKnownToBeAPowerOfTwo(I->getOperand(0), OrZero, Depth, Q) && - (OrZero || isKnownNonZero(I, Depth, Q)); + (OrZero || isKnownNonZero(I, Q, Depth)); case Instruction::And: // A power of two and'd with anything is a power of two or zero. if (OrZero && @@ -2134,7 +2134,7 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, // X & (-X) is always a power of two or zero. if (match(I->getOperand(0), m_Neg(m_Specific(I->getOperand(1)))) || match(I->getOperand(1), m_Neg(m_Specific(I->getOperand(0))))) - return OrZero || isKnownNonZero(I->getOperand(0), Depth, Q); + return OrZero || isKnownNonZero(I->getOperand(0), Q, Depth); return false; case Instruction::Add: { // Adding a power-of-two or zero to the same power-of-two or zero yields @@ -2249,7 +2249,7 @@ static bool isGEPKnownNonNull(const GEPOperator *GEP, unsigned Depth, // If the base pointer is non-null, we cannot walk to a null address with an // inbounds GEP in address space zero. - if (isKnownNonZero(GEP->getPointerOperand(), Depth, Q)) + if (isKnownNonZero(GEP->getPointerOperand(), Q, Depth)) return true; // Walk the GEP operands and see if any operand introduces a non-zero offset. @@ -2288,7 +2288,7 @@ static bool isGEPKnownNonNull(const GEPOperator *GEP, unsigned Depth, if (Depth++ >= MaxAnalysisRecursionDepth) continue; - if (isKnownNonZero(GTI.getOperand(), Depth, Q)) + if (isKnownNonZero(GTI.getOperand(), Q, Depth)) return true; } @@ -2441,8 +2441,8 @@ static bool isNonZeroAdd(const APInt &DemandedElts, unsigned Depth, const SimplifyQuery &Q, unsigned BitWidth, Value *X, Value *Y, bool NSW, bool NUW) { if (NUW) - return isKnownNonZero(Y, DemandedElts, Depth, Q) || - isKnownNonZero(X, DemandedElts, Depth, Q); + return isKnownNonZero(Y, DemandedElts, Q, Depth) || + isKnownNonZero(X, DemandedElts, Q, Depth); KnownBits XKnown = computeKnownBits(X, DemandedElts, Depth, Q); KnownBits YKnown = computeKnownBits(Y, DemandedElts, Depth, Q); @@ -2450,8 +2450,8 @@ static bool isNonZeroAdd(const APInt &DemandedElts, unsigned Depth, // If X and Y are both non-negative (as signed values) then their sum is not // zero unless both X and Y are zero. if (XKnown.isNonNegative() && YKnown.isNonNegative()) - if (isKnownNonZero(Y, DemandedElts, Depth, Q) || - isKnownNonZero(X, DemandedElts, Depth, Q)) + if (isKnownNonZero(Y, DemandedElts, Q, Depth) || + isKnownNonZero(X, DemandedElts, Q, Depth)) return true; // If X and Y are both negative (as signed values) then their sum is not @@ -2485,7 +2485,7 @@ static bool isNonZeroSub(const APInt &DemandedElts, unsigned Depth, Value *Y) { // TODO: Move this case into isKnownNonEqual(). 
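  // (That is, X - Y with X == 0 reduces to checking that Y itself is
  // non-zero.)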
if (auto *C = dyn_cast(X)) - if (C->isNullValue() && isKnownNonZero(Y, DemandedElts, Depth, Q)) + if (C->isNullValue() && isKnownNonZero(Y, DemandedElts, Q, Depth)) return true; return ::isKnownNonEqual(X, Y, Depth, Q); @@ -2497,18 +2497,18 @@ static bool isNonZeroMul(const APInt &DemandedElts, unsigned Depth, // If X and Y are non-zero then so is X * Y as long as the multiplication // does not overflow. if (NSW || NUW) - return isKnownNonZero(X, DemandedElts, Depth, Q) && - isKnownNonZero(Y, DemandedElts, Depth, Q); + return isKnownNonZero(X, DemandedElts, Q, Depth) && + isKnownNonZero(Y, DemandedElts, Q, Depth); // If either X or Y is odd, then if the other is non-zero the result can't // be zero. KnownBits XKnown = computeKnownBits(X, DemandedElts, Depth, Q); if (XKnown.One[0]) - return isKnownNonZero(Y, DemandedElts, Depth, Q); + return isKnownNonZero(Y, DemandedElts, Q, Depth); KnownBits YKnown = computeKnownBits(Y, DemandedElts, Depth, Q); if (YKnown.One[0]) - return XKnown.isNonZero() || isKnownNonZero(X, DemandedElts, Depth, Q); + return XKnown.isNonZero() || isKnownNonZero(X, DemandedElts, Q, Depth); // If there exists any subset of X (sX) and subset of Y (sY) s.t sX * sY is // non-zero, then X * Y is non-zero. We can find sX and sY by just taking @@ -2564,7 +2564,7 @@ static bool isNonZeroShift(const Operator *I, const APInt &DemandedElts, // non-zero then at least one non-zero bit must remain. if (InvShiftOp(KnownVal.Zero, NumBits - MaxShift) .eq(InvShiftOp(APInt::getAllOnes(NumBits), NumBits - MaxShift)) && - isKnownNonZero(I->getOperand(0), DemandedElts, Depth, Q)) + isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth)) return true; return false; @@ -2613,7 +2613,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, Type *FromTy = I->getOperand(0)->getType(); if ((FromTy->isIntOrIntVectorTy() || FromTy->isPtrOrPtrVectorTy()) && (BitWidth % getBitWidth(FromTy->getScalarType(), Q.DL)) == 0) - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); } break; case Instruction::IntToPtr: // Note that we have to take special care to avoid looking through @@ -2622,7 +2622,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, if (!isa(I->getType()) && Q.DL.getTypeSizeInBits(I->getOperand(0)->getType()).getFixedValue() <= Q.DL.getTypeSizeInBits(I->getType()).getFixedValue()) - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); break; case Instruction::PtrToInt: // Similar to int2ptr above, we can look through ptr2int here if the cast @@ -2630,25 +2630,25 @@ static bool isKnownNonZeroFromOperator(const Operator *I, if (!isa(I->getType()) && Q.DL.getTypeSizeInBits(I->getOperand(0)->getType()).getFixedValue() <= Q.DL.getTypeSizeInBits(I->getType()).getFixedValue()) - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); break; case Instruction::Sub: return isNonZeroSub(DemandedElts, Depth, Q, BitWidth, I->getOperand(0), I->getOperand(1)); case Instruction::Or: // X | Y != 0 if X != 0 or Y != 0. - return isKnownNonZero(I->getOperand(1), DemandedElts, Depth, Q) || - isKnownNonZero(I->getOperand(0), DemandedElts, Depth, Q); + return isKnownNonZero(I->getOperand(1), DemandedElts, Q, Depth) || + isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth); case Instruction::SExt: case Instruction::ZExt: // ext X != 0 if X != 0. 
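    // Extension only adds high bits, so the result is zero iff the input is.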
- return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); case Instruction::Shl: { // shl nsw/nuw can't remove any non-zero bits. const OverflowingBinaryOperator *BO = cast(I); if (Q.IIQ.hasNoUnsignedWrap(BO) || Q.IIQ.hasNoSignedWrap(BO)) - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); // shl X, Y != 0 if X is odd. Note that the value of the shift is undefined // if the lowest bit is shifted off the end. @@ -2664,7 +2664,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, // shr exact can only shift out zero bits. const PossiblyExactOperator *BO = cast(I); if (BO->isExact()) - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); // shr X, Y != 0 if X is negative. Note that the value of the shift is not // defined if the sign bit is shifted off the end. @@ -2680,7 +2680,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, // X / Y // div exact can only produce a zero if the dividend is zero. if (cast(I)->isExact()) - return isKnownNonZero(I->getOperand(0), DemandedElts, Depth, Q); + return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth); std::optional XUgeY; KnownBits XKnown = @@ -2730,7 +2730,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, Value *Op; Op = IsTrueArm ? I->getOperand(1) : I->getOperand(2); // Op is trivially non-zero. - if (isKnownNonZero(Op, DemandedElts, Depth, Q)) + if (isKnownNonZero(Op, DemandedElts, Q, Depth)) return true; // The condition of the select dominates the true/false arm. Check if the @@ -2780,7 +2780,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, } } // Finally recurse on the edge and check it directly. - return isKnownNonZero(U.get(), DemandedElts, NewDepth, RecQ); + return isKnownNonZero(U.get(), DemandedElts, RecQ, NewDepth); }); } case Instruction::InsertElement: { @@ -2802,9 +2802,9 @@ static bool isKnownNonZeroFromOperator(const Operator *I, // Result is zero if Elt is non-zero and rest of the demanded elts in Vec // are non-zero. - return (SkipElt || isKnownNonZero(Elt, Depth, Q)) && + return (SkipElt || isKnownNonZero(Elt, Q, Depth)) && (DemandedVecElts.isZero() || - isKnownNonZero(Vec, DemandedVecElts, Depth, Q)); + isKnownNonZero(Vec, DemandedVecElts, Q, Depth)); } case Instruction::ExtractElement: if (const auto *EEI = dyn_cast(I)) { @@ -2816,7 +2816,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, APInt DemandedVecElts = APInt::getAllOnes(NumElts); if (CIdx && CIdx->getValue().ult(NumElts)) DemandedVecElts = APInt::getOneBitSet(NumElts, CIdx->getZExtValue()); - return isKnownNonZero(Vec, DemandedVecElts, Depth, Q); + return isKnownNonZero(Vec, DemandedVecElts, Q, Depth); } } break; @@ -2831,12 +2831,12 @@ static bool isKnownNonZeroFromOperator(const Operator *I, break; // If demanded elements for both vecs are non-zero, the shuffle is non-zero. 
return (DemandedRHS.isZero() || - isKnownNonZero(Shuf->getOperand(1), DemandedRHS, Depth, Q)) && + isKnownNonZero(Shuf->getOperand(1), DemandedRHS, Q, Depth)) && (DemandedLHS.isZero() || - isKnownNonZero(Shuf->getOperand(0), DemandedLHS, Depth, Q)); + isKnownNonZero(Shuf->getOperand(0), DemandedLHS, Q, Depth)); } case Instruction::Freeze: - return isKnownNonZero(I->getOperand(0), Depth, Q) && + return isKnownNonZero(I->getOperand(0), Q, Depth) && isGuaranteedNotToBePoison(I->getOperand(0), Q.AC, Q.CxtI, Q.DT, Depth); case Instruction::Load: { @@ -2886,7 +2886,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, if (Call->isReturnNonNull()) return true; if (const auto *RP = getArgumentAliasingToReturnedPointer(Call, true)) - return isKnownNonZero(RP, Depth, Q); + return isKnownNonZero(RP, Q, Depth); } else { if (MDNode *Ranges = Q.IIQ.getMetadata(Call, LLVMContext::MD_range)) return rangeMetadataExcludesValue(Ranges, APInt::getZero(BitWidth)); @@ -2896,7 +2896,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, return true; } if (const Value *RV = Call->getReturnedArgOperand()) - if (RV->getType() == I->getType() && isKnownNonZero(RV, Depth, Q)) + if (RV->getType() == I->getType() && isKnownNonZero(RV, Q, Depth)) return true; } @@ -2908,7 +2908,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, case Intrinsic::bitreverse: case Intrinsic::bswap: case Intrinsic::ctpop: - return isKnownNonZero(II->getArgOperand(0), DemandedElts, Depth, Q); + return isKnownNonZero(II->getArgOperand(0), DemandedElts, Q, Depth); // NB: We don't do usub_sat here as in any case we can prove its // non-zero, we will fold it to `sub nuw` in InstCombine. case Intrinsic::ssub_sat: @@ -2924,11 +2924,11 @@ static bool isKnownNonZeroFromOperator(const Operator *I, case Intrinsic::vector_reduce_umin: case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_smin: - return isKnownNonZero(II->getArgOperand(0), Depth, Q); + return isKnownNonZero(II->getArgOperand(0), Q, Depth); case Intrinsic::umax: case Intrinsic::uadd_sat: - return isKnownNonZero(II->getArgOperand(1), DemandedElts, Depth, Q) || - isKnownNonZero(II->getArgOperand(0), DemandedElts, Depth, Q); + return isKnownNonZero(II->getArgOperand(1), DemandedElts, Q, Depth) || + isKnownNonZero(II->getArgOperand(0), DemandedElts, Q, Depth); case Intrinsic::smax: { // If either arg is strictly positive the result is non-zero. Otherwise // the result is non-zero if both ops are non-zero. @@ -2936,7 +2936,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, const KnownBits &OpKnown) { if (!OpNonZero.has_value()) OpNonZero = OpKnown.isNonZero() || - isKnownNonZero(Op, DemandedElts, Depth, Q); + isKnownNonZero(Op, DemandedElts, Q, Depth); return *OpNonZero; }; // Avoid re-computing isKnownNonZero. @@ -2971,8 +2971,8 @@ static bool isKnownNonZeroFromOperator(const Operator *I, } [[fallthrough]]; case Intrinsic::umin: - return isKnownNonZero(II->getArgOperand(0), DemandedElts, Depth, Q) && - isKnownNonZero(II->getArgOperand(1), DemandedElts, Depth, Q); + return isKnownNonZero(II->getArgOperand(0), DemandedElts, Q, Depth) && + isKnownNonZero(II->getArgOperand(1), DemandedElts, Q, Depth); case Intrinsic::cttz: return computeKnownBits(II->getArgOperand(0), DemandedElts, Depth, Q) .Zero[0]; @@ -2983,12 +2983,12 @@ static bool isKnownNonZeroFromOperator(const Operator *I, case Intrinsic::fshl: // If Op0 == Op1, this is a rotate. rotate(x, y) != 0 iff x != 0. 
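    // A rotate permutes bits without discarding any, so it preserves the
    // (non-)zeroness of its input.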
if (II->getArgOperand(0) == II->getArgOperand(1)) - return isKnownNonZero(II->getArgOperand(0), DemandedElts, Depth, Q); + return isKnownNonZero(II->getArgOperand(0), DemandedElts, Q, Depth); break; case Intrinsic::vscale: return true; case Intrinsic::experimental_get_vector_length: - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); default: break; } @@ -3010,8 +3010,8 @@ static bool isKnownNonZeroFromOperator(const Operator *I, /// specified, perform context-sensitive analysis and return true if the /// pointer couldn't possibly be null at the specified instruction. /// Supports values with integer or pointer type and vectors of integers. -bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth, - const SimplifyQuery &Q) { +bool isKnownNonZero(const Value *V, const APInt &DemandedElts, + const SimplifyQuery &Q, unsigned Depth) { Type *Ty = V->getType(); #ifndef NDEBUG @@ -3101,12 +3101,12 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth, return false; } -bool llvm::isKnownNonZero(const Value *V, unsigned Depth, - const SimplifyQuery &Q) { +bool llvm::isKnownNonZero(const Value *V, const SimplifyQuery &Q, + unsigned Depth) { auto *FVTy = dyn_cast(V->getType()); APInt DemandedElts = FVTy ? APInt::getAllOnes(FVTy->getNumElements()) : APInt(1, 1); - return ::isKnownNonZero(V, DemandedElts, Depth, Q); + return ::isKnownNonZero(V, DemandedElts, Q, Depth); } /// If the pair of operators are the same invertible function, return the @@ -3253,7 +3253,7 @@ static bool isModifyingBinopOfNonZero(const Value *V1, const Value *V2, Op = BO->getOperand(0); else return false; - return isKnownNonZero(Op, Depth + 1, Q); + return isKnownNonZero(Op, Q, Depth + 1); } return false; } @@ -3266,7 +3266,7 @@ static bool isNonEqualMul(const Value *V1, const Value *V2, unsigned Depth, const APInt *C; return match(OBO, m_Mul(m_Specific(V1), m_APInt(C))) && (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) && - !C->isZero() && !C->isOne() && isKnownNonZero(V1, Depth + 1, Q); + !C->isZero() && !C->isOne() && isKnownNonZero(V1, Q, Depth + 1); } return false; } @@ -3279,7 +3279,7 @@ static bool isNonEqualShl(const Value *V1, const Value *V2, unsigned Depth, const APInt *C; return match(OBO, m_Shl(m_Specific(V1), m_APInt(C))) && (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) && - !C->isZero() && isKnownNonZero(V1, Depth + 1, Q); + !C->isZero() && isKnownNonZero(V1, Q, Depth + 1); } return false; } diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 22dbb3198a9f17..e657872c382848 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2314,7 +2314,7 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, // Bail if the value is never zero. Use &Op = CountZeros->getOperandUse(0); - if (isKnownNonZero(Op, /*Depth=*/0, *DL)) + if (isKnownNonZero(Op, *DL)) return false; // The intrinsic will be sunk behind a compare against zero and branch. 
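The change running through these hunks is a mechanical reorder of the
isKnownNonZero parameters: the old shape (V, Depth, Q) becomes (V, Q, Depth),
with the same two-step applied to the static DemandedElts overload, and the
scattered /*Depth=*/0 arguments simply disappear. That only type-checks if
Depth is now defaulted in the header; the ValueTracking.h hunk is not shown in
this excerpt, so read the default below as an inference from the updated call
sites. A minimal self-contained sketch of the same refactor, using stand-in
types rather than the real LLVM ones:

  #include <cstdio>

  struct Query {}; // stands in for llvm::SimplifyQuery

  // Old shape: Depth sits between the value and the query, so even one-shot
  // callers had to spell out /*Depth=*/0.
  static bool isNonZeroOld(int V, unsigned Depth, const Query &Q) {
    return Depth < 6 && V != 0; // cap mirrors a MaxAnalysisRecursionDepth-style limit
  }

  // New shape: Depth is trailing and defaulted. External callers drop it;
  // recursive internal callers keep passing Depth + 1 explicitly.
  static bool isNonZeroNew(int V, const Query &Q, unsigned Depth = 0) {
    return Depth < 6 && V != 0;
  }

  int main() {
    Query Q;
    std::printf("%d\n", isNonZeroOld(42, /*Depth=*/0, Q)); // old call shape
    std::printf("%d\n", isNonZeroNew(42, Q));              // new call shape
  }

The reorder keeps every internal recursion site (Depth + 1) valid while
letting the dozens of leaf callers shrink to two arguments, which is the
entire diff surface above.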
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index f27d8d64a10404..41b66aafe7d343 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -2453,7 +2453,7 @@ bool AANonNull::isImpliedByIR(Attributor &A, const IRPosition &IRP, if (llvm::any_of(Worklist, [&](AA::ValueAndContext VAC) { return !isKnownNonZero( - VAC.getValue(), /*Depth=*/0, + VAC.getValue(), SimplifyQuery(A.getDataLayout(), DT, AC, VAC.getCtxI())); })) return false; diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 14612b251d1a42..7ebf265e17ba1f 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1175,7 +1175,7 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, Value *RetVal = FlowsToReturn[i]; // If this value is locally known to be non-null, we're good - if (isKnownNonZero(RetVal, /*Depth=*/0, DL)) + if (isKnownNonZero(RetVal, DL)) continue; // Otherwise, we need to look upwards since we can't make any local diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 07c50d866544b3..c59b867b10e7d1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -988,7 +988,7 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) { if (C->isOne()) { if (match(Op0, m_ZExt(m_Add(m_Value(X), m_AllOnes())))) { const SimplifyQuery Q = SQ.getWithInstruction(&Add); - if (llvm::isKnownNonZero(X, /*Depth=*/0, Q)) + if (llvm::isKnownNonZero(X, Q)) return new ZExtInst(X, Ty); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 2c0c4ee46e8098..d311690be64f16 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1039,9 +1039,9 @@ static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp, match(ZeroCmpOp, m_c_Add(m_Specific(A), m_Value(B))) && (ZeroICmp->hasOneUse() || UnsignedICmp->hasOneUse())) { auto GetKnownNonZeroAndOther = [&](Value *&NonZero, Value *&Other) { - if (!isKnownNonZero(NonZero, /*Depth=*/0, Q)) + if (!isKnownNonZero(NonZero, Q)) std::swap(NonZero, Other); - return isKnownNonZero(NonZero, /*Depth=*/0, Q); + return isKnownNonZero(NonZero, Q); }; // Given ZeroCmpOp = (A + B) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index ba5db854647a42..60e4be883f513b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -601,8 +601,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { // then change the 'ZeroIsPoison' parameter to 'true' // because we know the zero behavior can't affect the result. if (!Known.One.isZero() || - isKnownNonZero(Op0, /*Depth=*/0, - IC.getSimplifyQuery().getWithInstruction(&II))) { + isKnownNonZero(Op0, IC.getSimplifyQuery().getWithInstruction(&II))) { if (!match(II.getArgOperand(1), m_One())) return IC.replaceOperand(II, 1, IC.Builder.getTrue()); } @@ -2067,8 +2066,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // See if we can deduce non-null. 
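    // Attaching nonnull here lets later folds treat the call result as
    // provably non-null without redoing the analysis.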
if (!CI.hasRetAttr(Attribute::NonNull) && (Known.isNonZero() || - isKnownNonZero(II, /*Depth=*/0, - getSimplifyQuery().getWithInstruction(II)))) { + isKnownNonZero(II, getSimplifyQuery().getWithInstruction(II)))) { CI.addRetAttr(Attribute::NonNull); Changed = true; } @@ -3664,8 +3662,7 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { for (Value *V : Call.args()) { if (V->getType()->isPointerTy() && !Call.paramHasAttr(ArgNo, Attribute::NonNull) && - isKnownNonZero(V, /*Depth=*/0, - getSimplifyQuery().getWithInstruction(&Call))) + isKnownNonZero(V, getSimplifyQuery().getWithInstruction(&Call))) ArgNos.push_back(ArgNo); ArgNo++; } @@ -3845,7 +3842,7 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { // isKnownNonNull -> nonnull attribute if (!GCR.hasRetAttr(Attribute::NonNull) && - isKnownNonZero(DerivedPtr, /*Depth=*/0, + isKnownNonZero(DerivedPtr, getSimplifyQuery().getWithInstruction(&Call))) { GCR.addRetAttr(Attribute::NonNull); // We discovered new fact, re-check users. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index ee783eed190a7c..de909077017432 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1273,12 +1273,12 @@ Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) { // if X non-zero and NoOverflow(X * Y) // (icmp eq/ne Y) - if (!XKnown.One.isZero() || isKnownNonZero(X, /*Depth=*/0, Q)) + if (!XKnown.One.isZero() || isKnownNonZero(X, Q)) return new ICmpInst(Pred, Y, Cmp.getOperand(1)); // if Y non-zero and NoOverflow(X * Y) // (icmp eq/ne X) - if (!YKnown.One.isZero() || isKnownNonZero(Y, /*Depth=*/0, Q)) + if (!YKnown.One.isZero() || isKnownNonZero(Y, Q)) return new ICmpInst(Pred, X, Cmp.getOperand(1)); } // Note, we are skipping cases: @@ -3087,7 +3087,7 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp, // (X + -1) X <=u C (if X is never null) if (Pred == CmpInst::ICMP_ULT && C2->isAllOnes()) { const SimplifyQuery Q = SQ.getWithInstruction(&Cmp); - if (llvm::isKnownNonZero(X, /*Depth=*/0, Q)) + if (llvm::isKnownNonZero(X, Q)) return new ICmpInst(ICmpInst::ICMP_ULE, X, ConstantInt::get(Ty, C)); } @@ -4275,7 +4275,7 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst::Predicate Pred, Value *Op0, // Look for: x & ~Mask pred ~Mask if (isMaskOrZero(X, /*Not=*/true, Q)) { - return !ICmpInst::isSigned(Pred) || isKnownNonZero(X, /*Depth=*/0, Q); + return !ICmpInst::isSigned(Pred) || isKnownNonZero(X, Q); } return false; } @@ -4779,7 +4779,7 @@ static Instruction *foldICmpXorXX(ICmpInst &I, const SimplifyQuery &Q, // icmp (X ^ Y_NonZero) s>= X --> icmp (X ^ Y_NonZero) s> X // icmp (X ^ Y_NonZero) s<= X --> icmp (X ^ Y_NonZero) s< X CmpInst::Predicate PredOut = CmpInst::getStrictPredicate(Pred); - if (PredOut != Pred && isKnownNonZero(A, /*Depth=*/0, Q)) + if (PredOut != Pred && isKnownNonZero(A, Q)) return new ICmpInst(PredOut, Op0, Op1); return nullptr; @@ -5062,11 +5062,11 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, return new ICmpInst(Pred, C, D); // (A - B) u>=/u< A --> B u>/u<= A iff B != 0 if (A == Op1 && (Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_ULT) && - isKnownNonZero(B, /*Depth=*/0, Q)) + isKnownNonZero(B, Q)) return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), B, A); // C u<=/u> (C - D) --> C u= D iff B != 0 if (C == Op0 && (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT) && - isKnownNonZero(D, 
/*Depth=*/0, Q)) + isKnownNonZero(D, Q)) return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), C, D); // icmp (A-B), (C-B) -> icmp A, C for equalities or if there is no overflow. @@ -5108,13 +5108,13 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, // X * Z eq/ne Y * Z -> X eq/ne Y if (ZKnown.countMaxTrailingZeros() == 0) return new ICmpInst(Pred, X, Y); - NonZero = !ZKnown.One.isZero() || isKnownNonZero(Z, /*Depth=*/0, Q); + NonZero = !ZKnown.One.isZero() || isKnownNonZero(Z, Q); // if Z != 0 and nsw(X * Z) and nsw(Y * Z) // X * Z eq/ne Y * Z -> X eq/ne Y if (NonZero && BO0 && BO1 && Op0HasNSW && Op1HasNSW) return new ICmpInst(Pred, X, Y); } else - NonZero = isKnownNonZero(Z, /*Depth=*/0, Q); + NonZero = isKnownNonZero(Z, Q); // If Z != 0 and nuw(X * Z) and nuw(Y * Z) // X * Z u{lt/le/gt/ge}/eq/ne Y * Z -> X u{lt/le/gt/ge}/eq/ne Y diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 9838e2aa9f3a24..52803e9bea451e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -1537,8 +1537,7 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) { for (unsigned I = 0, E = PN.getNumIncomingValues(); I != E; ++I) { Instruction *CtxI = PN.getIncomingBlock(I)->getTerminator(); Value *VA = PN.getIncomingValue(I); - if (isKnownNonZero(VA, 0, - getSimplifyQuery().getWithInstruction(CtxI))) { + if (isKnownNonZero(VA, getSimplifyQuery().getWithInstruction(CtxI))) { if (!NonZeroConst) NonZeroConst = getAnyNonZeroConstInt(PN); if (NonZeroConst != VA) { diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 4c00f2a0ea1761..5a144cc7378962 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1431,7 +1431,7 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCastsFromSign( if (OpsKnown[OpNo].hasKnownBits() && OpsKnown[OpNo].getKnownBits(SQ).isNonZero()) return true; - return isKnownNonZero(IntOps[OpNo], /*Depth=*/0, SQ); + return isKnownNonZero(IntOps[OpNo], SQ); }; auto IsNonNeg = [&](unsigned OpNo) -> bool { diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index a72b0ee9a08e01..ee3531bbd68df3 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1281,7 +1281,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // ignored. return; } - if (llvm::isKnownNonZero(ConvertedShadow, /*Depth=*/0, DL)) { + if (llvm::isKnownNonZero(ConvertedShadow, DL)) { // Copy origin as the value is definitely uninitialized. paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, OriginAlignment); @@ -1427,7 +1427,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // Skip, value is initialized or const shadow is ignored. continue; } - if (llvm::isKnownNonZero(ConvertedShadow, /*Depth=*/0, DL)) { + if (llvm::isKnownNonZero(ConvertedShadow, DL)) { // Report as the value is definitely uninitialized. 
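        // A shadow proven non-zero means some bit of the value is
        // uninitialized on every path, so no runtime check is needed.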
insertWarningFn(IRB, ShadowData.Origin); if (!MS.Recover) diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index f376b5f7d68d4a..40d0f6b75d69b0 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -459,7 +459,7 @@ static void convertMetadataToAssumes(LoadInst *LI, Value *Val, // we can only do this if the value is known non-poison. if (AC && LI->getMetadata(LLVMContext::MD_nonnull) && LI->getMetadata(LLVMContext::MD_noundef) && - !isKnownNonZero(Val, /*Depth=*/0, SimplifyQuery(DL, DT, AC, LI))) + !isKnownNonZero(Val, SimplifyQuery(DL, DT, AC, LI))) addAssumeNonNull(AC, LI); } diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 7e9e91606fe22d..2e68a9c01898c8 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -305,7 +305,7 @@ static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef A if (ConstantInt *LenC = dyn_cast(Size)) { annotateNonNullNoUndefBasedOnAccess(CI, ArgNos); annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue()); - } else if (isKnownNonZero(Size, /*Depth=*/0, DL)) { + } else if (isKnownNonZero(Size, DL)) { annotateNonNullNoUndefBasedOnAccess(CI, ArgNos); const APInt *X, *Y; uint64_t DerefMin = 1; @@ -394,7 +394,7 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) { Value *Size = CI->getArgOperand(2); uint64_t Len; annotateNonNullNoUndefBasedOnAccess(CI, 0); - if (isKnownNonZero(Size, /*Depth=*/0, DL)) + if (isKnownNonZero(Size, DL)) annotateNonNullNoUndefBasedOnAccess(CI, 1); // We don't do anything if length is not constant. @@ -613,7 +613,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { if (Str1P == Str2P) // strncmp(x,x,n) -> 0 return ConstantInt::get(CI->getType(), 0); - if (isKnownNonZero(Size, /*Depth=*/0, DL)) + if (isKnownNonZero(Size, DL)) annotateNonNullNoUndefBasedOnAccess(CI, {0, 1}); // Get the length argument if it is constant. uint64_t Length; @@ -749,7 +749,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { Value *LibCallSimplifier::optimizeStrLCpy(CallInst *CI, IRBuilderBase &B) { Value *Size = CI->getArgOperand(2); - if (isKnownNonZero(Size, /*Depth=*/0, DL)) + if (isKnownNonZero(Size, DL)) // Like snprintf, the function stores into the destination only when // the size argument is nonzero. annotateNonNullNoUndefBasedOnAccess(CI, 0); @@ -833,7 +833,7 @@ Value *LibCallSimplifier::optimizeStringNCpy(CallInst *CI, bool RetEnd, Value *Src = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); - if (isKnownNonZero(Size, /*Depth=*/0, DL)) { + if (isKnownNonZero(Size, DL)) { // Both st{p,r}ncpy(D, S, N) access the source and destination arrays // only when N is nonzero. 
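    // With a possibly-zero N there is no guaranteed access to derive the
    // nonnull/noundef attributes from, hence the guard above.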
annotateNonNullNoUndefBasedOnAccess(CI, 0); @@ -926,7 +926,7 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, Type *CharTy = B.getIntNTy(CharSize); if (isOnlyUsedInZeroEqualityComparison(CI) && - (!Bound || isKnownNonZero(Bound, /*Depth=*/0, DL))) { + (!Bound || isKnownNonZero(Bound, DL))) { // Fold strlen: // strlen(x) != 0 --> *x != 0 // strlen(x) == 0 --> *x == 0 @@ -1047,7 +1047,7 @@ Value *LibCallSimplifier::optimizeStrNLen(CallInst *CI, IRBuilderBase &B) { if (Value *V = optimizeStringLength(CI, B, 8, Bound)) return V; - if (isKnownNonZero(Bound, /*Depth=*/0, DL)) + if (isKnownNonZero(Bound, DL)) annotateNonNullNoUndefBasedOnAccess(CI, 0); return nullptr; } @@ -1291,7 +1291,7 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) { Value *SrcStr = CI->getArgOperand(0); Value *Size = CI->getArgOperand(2); - if (isKnownNonZero(Size, /*Depth=*/0, DL)) { + if (isKnownNonZero(Size, DL)) { annotateNonNullNoUndefBasedOnAccess(CI, 0); if (isOnlyUsedInEqualityComparison(CI, SrcStr)) return memChrToCharCompare(CI, Size, B, DL); @@ -2976,7 +2976,7 @@ Value *LibCallSimplifier::optimizeStrToInt(CallInst *CI, IRBuilderBase &B, // It would be readonly too, except that it still may write to errno. CI->addParamAttr(0, Attribute::NoCapture); EndPtr = nullptr; - } else if (!isKnownNonZero(EndPtr, /*Depth=*/0, DL)) + } else if (!isKnownNonZero(EndPtr, DL)) return nullptr; StringRef Str; @@ -3402,7 +3402,7 @@ Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilderBase &B) { return V; } - if (isKnownNonZero(CI->getOperand(1), /*Depth=*/0, DL)) + if (isKnownNonZero(CI->getOperand(1), DL)) annotateNonNullNoUndefBasedOnAccess(CI, 0); return nullptr; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index e0e2f50c89adad..4918cee1fa82a3 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -886,7 +886,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode( *FunctionalOpcode, &VPI, nullptr, &AC, &DT); if (!SafeToSpeculate && - !isKnownNonZero(EVL, /*Depth=*/0, SimplifyQuery(*DL, &DT, &AC, &VPI))) + !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI))) return false; Value *ScalarVal = diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp index 8ebd9b511f39fe..8738af91b652b8 100644 --- a/llvm/unittests/Analysis/ValueTrackingTest.cpp +++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp @@ -2110,8 +2110,7 @@ TEST_F(ValueTrackingTest, isNonZeroRecurrence) { )"); const DataLayout &DL = M->getDataLayout(); AssumptionCache AC(*F); - EXPECT_TRUE(isKnownNonZero(A, /*Depth=*/0, - SimplifyQuery(DL, /*DT=*/nullptr, &AC, CxtI))); + EXPECT_TRUE(isKnownNonZero(A, SimplifyQuery(DL, /*DT=*/nullptr, &AC, CxtI))); } TEST_F(ValueTrackingTest, KnownNonZeroFromDomCond) { @@ -2135,9 +2134,8 @@ TEST_F(ValueTrackingTest, KnownNonZeroFromDomCond) { DominatorTree DT(*F); const DataLayout &DL = M->getDataLayout(); const SimplifyQuery SQ(DL, &DT, &AC); - EXPECT_EQ(isKnownNonZero(A, /*Depth=*/0, SQ.getWithInstruction(CxtI)), true); - EXPECT_EQ(isKnownNonZero(A, /*Depth=*/0, SQ.getWithInstruction(CxtI2)), - false); + EXPECT_EQ(isKnownNonZero(A, SQ.getWithInstruction(CxtI)), true); + EXPECT_EQ(isKnownNonZero(A, SQ.getWithInstruction(CxtI2)), false); } TEST_F(ValueTrackingTest, KnownNonZeroFromDomCond2) { @@ -2161,9 +2159,8 @@ 
TEST_F(ValueTrackingTest, KnownNonZeroFromDomCond2) { DominatorTree DT(*F); const DataLayout &DL = M->getDataLayout(); const SimplifyQuery SQ(DL, &DT, &AC); - EXPECT_EQ(isKnownNonZero(A, /*Depth=*/0, SQ.getWithInstruction(CxtI)), true); - EXPECT_EQ(isKnownNonZero(A, /*Depth=*/0, SQ.getWithInstruction(CxtI2)), - false); + EXPECT_EQ(isKnownNonZero(A, SQ.getWithInstruction(CxtI)), true); + EXPECT_EQ(isKnownNonZero(A, SQ.getWithInstruction(CxtI2)), false); } TEST_F(ValueTrackingTest, IsImpliedConditionAnd) { From 5a34ff12b8f4a73f5dcd4be1b2575dc38cf13bee Mon Sep 17 00:00:00 2001 From: Jeremy Kun Date: Tue, 16 Apr 2024 07:35:36 -0700 Subject: [PATCH 110/300] fix Polynomial.td doc filename (#88900) Not sure how best to test this, but I think it fixes the error https://github.com/llvm/mlir-www/actions/runs/8699908058/job/23859264085#step:7:1111 Co-authored-by: Jeremy Kun Co-authored-by: Jacques Pienaar --- mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt index d8039deb5ee217..dd0384d8b79d66 100644 --- a/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt @@ -1,8 +1,8 @@ add_mlir_dialect(Polynomial polynomial) -add_mlir_doc(PolynomialDialect PolynomialDialect Polynomial/ -gen-dialect-doc) -add_mlir_doc(PolynomialOps PolynomialOps Polynomial/ -gen-op-doc) -add_mlir_doc(PolynomialAttributes PolynomialAttributes Dialects/ -gen-attrdef-doc) -add_mlir_doc(PolynomialTypes PolynomialTypes Dialects/ -gen-typedef-doc) +add_mlir_doc(Polynomial PolynomialDialect Polynomial/ -gen-dialect-doc) +add_mlir_doc(Polynomial PolynomialOps Polynomial/ -gen-op-doc) +add_mlir_doc(Polynomial PolynomialAttributes Dialects/ -gen-attrdef-doc) +add_mlir_doc(Polynomial PolynomialTypes Dialects/ -gen-typedef-doc) set(LLVM_TARGET_DEFINITIONS Polynomial.td) mlir_tablegen(PolynomialAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=polynomial) From b63247627c9e87e898dec5bf0bea255b3f0eec5c Mon Sep 17 00:00:00 2001 From: Ding Fei Date: Tue, 16 Apr 2024 22:38:27 +0800 Subject: [PATCH 111/300] [AST][RecoveryExpr] Fix a crash on c89/c90 invalid InitListExpr (#88008) (#88014) Use refactored `CheckForConstantInitializer()` to skip checking expr with error. --------- Co-authored-by: Aaron Ballman --- clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/Sema/Sema.h | 4 ++- clang/lib/Sema/SemaDecl.cpp | 28 ++++++++----------- clang/lib/Sema/SemaExpr.cpp | 2 +- .../test/Sema/recover-expr-gh88008-nocrash.c | 11 ++++++++ 5 files changed, 28 insertions(+), 19 deletions(-) create mode 100644 clang/test/Sema/recover-expr-gh88008-nocrash.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index db90db6fa4ab0e..d8ec8bcb8df532 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -417,6 +417,8 @@ Bug Fixes in This Version - Fixed a regression in CTAD that a friend declaration that befriends itself may cause incorrect constraint substitution. (#GH86769). +- Fixed an assertion failure on invalid InitListExpr in C89 mode (#GH88008). 
+ Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index a5fe83a539aaf8..77150a318ee47d 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -55,6 +55,7 @@ #include "clang/Sema/Scope.h" #include "clang/Sema/SemaBase.h" #include "clang/Sema/SemaConcept.h" +#include "clang/Sema/SemaDiagnostic.h" #include "clang/Sema/TypoCorrection.h" #include "clang/Sema/Weak.h" #include "llvm/ADT/ArrayRef.h" @@ -3427,7 +3428,8 @@ class Sema final : public SemaBase { bool ConstexprSupported, bool CLinkageMayDiffer); /// type checking declaration initializers (C99 6.7.8) - bool CheckForConstantInitializer(Expr *e, QualType t); + bool CheckForConstantInitializer( + Expr *Init, unsigned DiagID = diag::err_init_element_not_constant); QualType deduceVarTypeFromInitializer(VarDecl *VDecl, DeclarationName Name, QualType Type, TypeSourceInfo *TSI, diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 390da508518e16..745cf41e204e7a 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -12671,7 +12671,7 @@ void Sema::CheckMSVCRTEntryPoint(FunctionDecl *FD) { } } -bool Sema::CheckForConstantInitializer(Expr *Init, QualType DclT) { +bool Sema::CheckForConstantInitializer(Expr *Init, unsigned DiagID) { // FIXME: Need strict checking. In C89, we need to check for // any assignment, increment, decrement, function-calls, or // commas outside of a sizeof. In C99, it's the same list, @@ -12689,8 +12689,7 @@ bool Sema::CheckForConstantInitializer(Expr *Init, QualType DclT) { const Expr *Culprit; if (Init->isConstantInitializer(Context, false, &Culprit)) return false; - Diag(Culprit->getExprLoc(), diag::err_init_element_not_constant) - << Culprit->getSourceRange(); + Diag(Culprit->getExprLoc(), DiagID) << Culprit->getSourceRange(); return true; } @@ -13808,29 +13807,24 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { // OpenCL v1.2 s6.5.3: __constant locals must be constant-initialized. // This is true even in C++ for OpenCL. } else if (VDecl->getType().getAddressSpace() == LangAS::opencl_constant) { - CheckForConstantInitializer(Init, DclT); + CheckForConstantInitializer(Init); - // Otherwise, C++ does not restrict the initializer. + // Otherwise, C++ does not restrict the initializer. } else if (getLangOpts().CPlusPlus) { // do nothing // C99 6.7.8p4: All the expressions in an initializer for an object that has // static storage duration shall be constant expressions or string literals. } else if (VDecl->getStorageClass() == SC_Static) { - CheckForConstantInitializer(Init, DclT); + CheckForConstantInitializer(Init); - // C89 is stricter than C99 for aggregate initializers. - // C89 6.5.7p3: All the expressions [...] in an initializer list - // for an object that has aggregate or union type shall be - // constant expressions. + // C89 is stricter than C99 for aggregate initializers. + // C89 6.5.7p3: All the expressions [...] in an initializer list + // for an object that has aggregate or union type shall be + // constant expressions. 
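  // For example, an automatic `int a[2] = { n, 1 };` with a non-constant `n`
  // draws the ext_aggregate_init_not_constant extension diagnostic in C89.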
} else if (!getLangOpts().C99 && VDecl->getType()->isAggregateType() && isa(Init)) { - const Expr *Culprit; - if (!Init->isConstantInitializer(Context, false, &Culprit)) { - Diag(Culprit->getExprLoc(), - diag::ext_aggregate_init_not_constant) - << Culprit->getSourceRange(); - } + CheckForConstantInitializer(Init, diag::ext_aggregate_init_not_constant); } if (auto *E = dyn_cast(Init)) @@ -13963,7 +13957,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { // Avoid duplicate diagnostics for constexpr variables. if (!getLangOpts().CPlusPlus && !VDecl->isInvalidDecl() && !VDecl->isConstexpr()) - CheckForConstantInitializer(Init, DclT); + CheckForConstantInitializer(Init); } QualType InitType = Init->getType(); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index cabffa47c93185..7c3faba0f78819 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -7331,7 +7331,7 @@ Sema::BuildCompoundLiteralExpr(SourceLocation LParenLoc, TypeSourceInfo *TInfo, if (!LiteralExpr->isTypeDependent() && !LiteralExpr->isValueDependent() && !literalType->isDependentType()) // C99 6.5.2.5p3 - if (CheckForConstantInitializer(LiteralExpr, literalType)) + if (CheckForConstantInitializer(LiteralExpr)) return ExprError(); } else if (literalType.getAddressSpace() != LangAS::opencl_private && literalType.getAddressSpace() != LangAS::Default) { diff --git a/clang/test/Sema/recover-expr-gh88008-nocrash.c b/clang/test/Sema/recover-expr-gh88008-nocrash.c new file mode 100644 index 00000000000000..5500b33dd0e85d --- /dev/null +++ b/clang/test/Sema/recover-expr-gh88008-nocrash.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 %s -verify -fsyntax-only -std=c90 + +struct S { + int v; +}; + +struct T; // expected-note {{forward declaration of 'struct T'}} + +void gh88008_nocrash(struct T *t) { + struct S s = { .v = t->y }; // expected-error {{incomplete definition of type 'struct T'}} +} From d2d4a1bbdc455a30d600743eb59fb1c69205967a Mon Sep 17 00:00:00 2001 From: XChy Date: Tue, 16 Apr 2024 22:52:19 +0800 Subject: [PATCH 112/300] Revert "[JumpThreading] Thread over BB with only an unconditional branch" (#88907) Reverts llvm/llvm-project#86312 --- llvm/lib/Transforms/Utils/Local.cpp | 10 +- llvm/test/CodeGen/AArch64/and-sink.ll | 9 +- .../AArch64/combine-comparisons-by-cse.ll | 122 ++++++++++------- llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll | 18 ++- llvm/test/Transforms/JumpThreading/pr79175.ll | 8 +- llvm/test/Transforms/JumpThreading/select.ll | 50 ++++--- .../Transforms/JumpThreading/thread-prob-7.ll | 8 +- .../Transforms/JumpThreading/uncond-no-phi.ll | 123 ------------------ .../PhaseOrdering/thread-uncond-bb.ll | 62 --------- 9 files changed, 126 insertions(+), 284 deletions(-) delete mode 100644 llvm/test/Transforms/JumpThreading/uncond-no-phi.ll delete mode 100644 llvm/test/Transforms/PhaseOrdering/thread-uncond-bb.ll diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index baec51a07fcbfc..a42ef0c4e6ae9e 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1019,14 +1019,12 @@ CanRedirectPredsOfEmptyBBToSucc(BasicBlock *BB, BasicBlock *Succ, const SmallPtrSetImpl &SuccPreds, BasicBlock *&CommonPred) { - // When Succ has no phis, BB may be merged into Succ directly. We don't need - // to redirect the predecessors of BB in this case. 
- if (Succ->phis().empty()) + // There must be phis in BB, otherwise BB will be merged into Succ directly + if (BB->phis().empty() || Succ->phis().empty()) return false; - // BB must have multiple different predecessors, so that at least one of - // predecessors can be redirected to Succ, except the common predecessor. - if (BB->getUniquePredecessor() || pred_empty(BB)) + // BB must have predecessors not shared that can be redirected to Succ + if (!BB->hasNPredecessorsOrMore(2)) return false; // Get single common predecessors of both BB and Succ diff --git a/llvm/test/CodeGen/AArch64/and-sink.ll b/llvm/test/CodeGen/AArch64/and-sink.ll index a57e9d54f3078e..f298a55dab721e 100644 --- a/llvm/test/CodeGen/AArch64/and-sink.ll +++ b/llvm/test/CodeGen/AArch64/and-sink.ll @@ -11,14 +11,15 @@ define dso_local i32 @and_sink1(i32 %a, i1 %c) { ; CHECK-LABEL: and_sink1: ; CHECK: // %bb.0: -; CHECK-NEXT: tbz w1, #0, .LBB0_2 +; CHECK-NEXT: tbz w1, #0, .LBB0_3 ; CHECK-NEXT: // %bb.1: // %bb0 -; CHECK-NEXT: tst w0, #0x4 ; CHECK-NEXT: adrp x8, A -; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: str wzr, [x8, :lo12:A] +; CHECK-NEXT: tbnz w0, #2, .LBB0_3 +; CHECK-NEXT: // %bb.2: +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: .LBB0_3: // %bb2 ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll index dde3e81833a63d..6449c3e11d6672 100644 --- a/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll +++ b/llvm/test/CodeGen/AArch64/combine-comparisons-by-cse.ll @@ -13,10 +13,10 @@ define i32 @combine_gt_ge_10() #0 { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:a ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] -; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: ldr w8, [x8] +; CHECK-NEXT: cmp w8, #10 ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] -; CHECK-NEXT: cmp w9, #10 ; CHECK-NEXT: b.le .LBB0_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true ; CHECK-NEXT: adrp x9, :got:c @@ -29,17 +29,18 @@ define i32 @combine_gt_ge_10() #0 { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_3: // %lor.lhs.false -; CHECK-NEXT: cmp w9, #10 -; CHECK-NEXT: b.lt .LBB0_5 +; CHECK-NEXT: b.lt .LBB0_6 ; CHECK-NEXT: .LBB0_4: // %land.lhs.true3 ; CHECK-NEXT: adrp x9, :got:d ; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: b.ne .LBB0_6 +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB0_5: +; CHECK-NEXT: .LBB0_6: // %if.end ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: @@ -144,10 +145,10 @@ define i32 @combine_lt_ge_5() #0 { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: adrp x8, :got:a ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] -; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: ldr w8, [x8] +; CHECK-NEXT: cmp w8, #5 ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] -; CHECK-NEXT: cmp w9, #5 ; CHECK-NEXT: b.ge .LBB2_3 ; CHECK-NEXT: // %bb.1: // %land.lhs.true ; CHECK-NEXT: adrp x9, :got:c @@ -160,17 +161,18 @@ define i32 @combine_lt_ge_5() #0 { ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_3: // %lor.lhs.false -; CHECK-NEXT: cmp w9, #5 -; CHECK-NEXT: b.gt .LBB2_5 +; CHECK-NEXT: b.gt .LBB2_6 ; CHECK-NEXT: .LBB2_4: // %land.lhs.true3 ; CHECK-NEXT: adrp x9, :got:d ; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; 
CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: b.ne .LBB2_6 +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_5: +; CHECK-NEXT: .LBB2_6: // %if.end ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: @@ -497,17 +499,24 @@ define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 { ; CHECK-NEXT: // %bb.3: // %while.cond.while.end_crit_edge ; CHECK-NEXT: ldr w8, [x19] ; CHECK-NEXT: .LBB7_4: // %while.end -; CHECK-NEXT: adrp x9, :got:b -; CHECK-NEXT: adrp x10, :got:d -; CHECK-NEXT: ldr x9, [x9, :got_lo12:b] -; CHECK-NEXT: ldr x10, [x10, :got_lo12:d] -; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: b.gt .LBB7_7 +; CHECK-NEXT: // %bb.5: // %land.lhs.true +; CHECK-NEXT: adrp x8, :got:b +; CHECK-NEXT: adrp x9, :got:d +; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] +; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] +; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] -; CHECK-NEXT: ldr w10, [x10] -; CHECK-NEXT: cmp w9, w10 -; CHECK-NEXT: ccmp w8, #2, #0, eq -; CHECK-NEXT: mov w8, #123 // =0x7b -; CHECK-NEXT: csel w0, w8, wzr, lt +; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: b.ne .LBB7_7 +; CHECK-NEXT: // %bb.6: +; CHECK-NEXT: mov w0, #123 // =0x7b +; CHECK-NEXT: b .LBB7_8 +; CHECK-NEXT: .LBB7_7: // %if.end +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: .LBB7_8: // %return +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w19 @@ -555,42 +564,52 @@ return: ; preds = %if.end, %land.lhs.t define i32 @do_nothing_if_compares_can_not_be_adjusted_to_each_other() #0 { ; CHECK-LABEL: do_nothing_if_compares_can_not_be_adjusted_to_each_other: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: .cfi_remember_state ; CHECK-NEXT: adrp x8, :got:a ; CHECK-NEXT: ldr x8, [x8, :got_lo12:a] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: b.gt .LBB8_4 +; CHECK-NEXT: b.gt .LBB8_3 ; CHECK-NEXT: // %bb.1: // %while.body.preheader -; CHECK-NEXT: stp x30, x19, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: sub w19, w8, #1 ; CHECK-NEXT: .LBB8_2: // %while.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: bl do_something ; CHECK-NEXT: adds w19, w19, #1 ; CHECK-NEXT: b.mi .LBB8_2 -; CHECK-NEXT: // %bb.3: -; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w19 -; CHECK-NEXT: .cfi_restore w30 -; CHECK-NEXT: .LBB8_4: // %while.end +; CHECK-NEXT: .LBB8_3: // %while.end +; CHECK-NEXT: adrp x8, :got:c +; CHECK-NEXT: ldr x8, [x8, :got_lo12:c] +; CHECK-NEXT: ldr w8, [x8] +; CHECK-NEXT: cmn w8, #2 +; CHECK-NEXT: b.lt .LBB8_6 +; CHECK-NEXT: // %bb.4: // %land.lhs.true ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:d -; CHECK-NEXT: adrp x10, :got:c ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] ; CHECK-NEXT: ldr x9, [x9, :got_lo12:d] -; CHECK-NEXT: ldr x10, [x10, :got_lo12:c] ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] -; CHECK-NEXT: ldr w10, [x10] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: mov w8, #-3 // =0xfffffffd -; CHECK-NEXT: ccmp w10, w8, #4, eq -; CHECK-NEXT: mov w8, #123 // =0x7b -; CHECK-NEXT: csel w0, w8, wzr, gt +; CHECK-NEXT: b.ne .LBB8_6 +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: mov w0, #123 // =0x7b +; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB8_6: // %if.end +; CHECK-NEXT: .cfi_restore_state +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w30 ; CHECK-NEXT: ret entry: %0 = load i32, ptr @a, align 4 @@ -763,14 +782,12 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 { ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: csel x9, x0, xzr, gt ; CHECK-NEXT: str x9, [x1] -; CHECK-NEXT: b.le .LBB11_3 +; CHECK-NEXT: b.le .LBB11_2 ; CHECK-NEXT: // %bb.1: // %lor.lhs.false ; CHECK-NEXT: cmp w8, #2 -; CHECK-NEXT: b.ge .LBB11_5 -; CHECK-NEXT: // %bb.2: -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB11_3: // %land.lhs.true +; CHECK-NEXT: b.ge .LBB11_4 +; CHECK-NEXT: b .LBB11_6 +; CHECK-NEXT: .LBB11_2: // %land.lhs.true ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:c ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] @@ -778,11 +795,11 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: b.ne .LBB11_5 -; CHECK-NEXT: // %bb.4: +; CHECK-NEXT: b.ne .LBB11_4 +; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB11_5: // %land.lhs.true3 +; CHECK-NEXT: .LBB11_4: // %land.lhs.true3 ; CHECK-NEXT: adrp x8, :got:b ; CHECK-NEXT: adrp x9, :got:d ; CHECK-NEXT: ldr x8, [x8, :got_lo12:b] @@ -790,7 +807,12 @@ define i32 @combine_gt_ge_sel(i64 %v, ptr %p) #0 { ; CHECK-NEXT: ldr w8, [x8] ; CHECK-NEXT: ldr w9, [x9] ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: b.ne .LBB11_6 +; CHECK-NEXT: // %bb.5: +; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB11_6: // %if.end +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret entry: %0 = load i32, ptr @a, align 4 diff --git a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll index c33c81841be65e..dddc4bd953d7ac 100644 --- 
a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll +++ b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll @@ -10,13 +10,12 @@ define i32 @fred(ptr %a0) #0 { ; CHECK-LABEL: fred: ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { -; CHECK-NEXT: r1:0 = combine(r0,#0) -; CHECK-NEXT: if (p0) jumpr r31 +; CHECK-NEXT: if (p0) jump:nt .LBB0_2 ; CHECK-NEXT: } -; CHECK-NEXT: .LBB0_1: // %b2 +; CHECK-NEXT: // %bb.1: // %b2 ; CHECK-NEXT: { ; CHECK-NEXT: r3:2 = combine(#0,#0) -; CHECK-NEXT: r1:0 = memd(r1+#0) +; CHECK-NEXT: r1:0 = memd(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: p0 = vcmph.eq(r1:0,r3:2) @@ -28,7 +27,16 @@ define i32 @fred(ptr %a0) #0 { ; CHECK-NEXT: r0 = and(r0,#1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r0 = !cmp.eq(r0,#11) +; CHECK-NEXT: p0 = cmp.eq(r0,#11) +; CHECK-NEXT: r0 = #1 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: if (p0) r0 = #0 +; CHECK-NEXT: jumpr r31 +; CHECK-NEXT: } +; CHECK-NEXT: .LBB0_2: // %b14 +; CHECK-NEXT: { +; CHECK-NEXT: r0 = #0 ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } b0: diff --git a/llvm/test/Transforms/JumpThreading/pr79175.ll b/llvm/test/Transforms/JumpThreading/pr79175.ll index cce30ce079999c..2c7ee0770cdc73 100644 --- a/llvm/test/Transforms/JumpThreading/pr79175.ll +++ b/llvm/test/Transforms/JumpThreading/pr79175.ll @@ -17,11 +17,11 @@ define i32 @test(i64 %idx, i32 %val) { ; CHECK: cond.end: ; CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[VAL]], 0 ; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[CMP_I]] -; CHECK-NEXT: br i1 [[COND_FR]], label [[TMP0:%.*]], label [[COND_END_THREAD]] -; CHECK: 0: -; CHECK-NEXT: br label [[COND_END_THREAD]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[COND_END_THREAD]], label [[TMP0:%.*]] ; CHECK: cond.end.thread: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[VAL]], [[COND_END]] ], [ 0, [[TMP0]] ], [ 0, [[FOR_BODY]] ] +; CHECK-NEXT: br label [[TMP0]] +; CHECK: 0: +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[COND_END_THREAD]] ], [ [[VAL]], [[COND_END]] ] ; CHECK-NEXT: [[F_IDX:%.*]] = getelementptr inbounds i32, ptr @f, i64 [[IDX]] ; CHECK-NEXT: store i32 [[TMP1]], ptr [[F_IDX]], align 4 ; CHECK-NEXT: [[F_RELOAD:%.*]] = load i32, ptr @f, align 4 diff --git a/llvm/test/Transforms/JumpThreading/select.ll b/llvm/test/Transforms/JumpThreading/select.ll index 27ebf4c25da509..4ec55a66bb8ac1 100644 --- a/llvm/test/Transforms/JumpThreading/select.ll +++ b/llvm/test/Transforms/JumpThreading/select.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -S -passes="jump-threading" -debug-only=branch-prob < %s 2>&1 | FileCheck %s -; RUN: opt -S -passes="require,jump-threading" -debug-only=branch-prob -disable-output < %s 2>&1 | FileCheck -check-prefix=CHECK-BPI %s +; RUN: opt -S -passes="require,jump-threading" -debug-only=branch-prob < %s 2>&1 | FileCheck -check-prefixes=CHECK,CHECK-BPI %s ; REQUIRES: asserts ; CHECK-BPI-LABEL: ---- Branch Probability Info : unfold1 ---- @@ -21,7 +21,7 @@ declare void @quux() ; booleans where at least one operand is true/false/undef. ;. -; CHECK: @anchor = constant [3 x ptr] [ptr blockaddress(@test_indirectbr, %L1), ptr inttoptr (i32 1 to ptr), ptr blockaddress(@test_indirectbr, %L3)] +; CHECK: @[[ANCHOR:[a-zA-Z0-9_$"\\.-]+]] = constant [3 x ptr] [ptr blockaddress(@test_indirectbr, [[L1:%.*]]), ptr inttoptr (i32 1 to ptr), ptr blockaddress(@test_indirectbr, [[L3:%.*]])] ;. 
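; The functions below exercise select unfolding for br, switch, and
; indirectbr users of the select.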
define void @test_br(i1 %cond, i1 %value) nounwind { ; CHECK-LABEL: @test_br( @@ -66,8 +66,8 @@ define void @test_switch(i1 %cond, i8 %value) nounwind { ; CHECK-NEXT: call void @quux() ; CHECK-NEXT: [[EXPR:%.*]] = select i1 [[COND]], i8 1, i8 [[VALUE:%.*]] ; CHECK-NEXT: switch i8 [[EXPR]], label [[L3:%.*]] [ -; CHECK-NEXT: i8 1, label [[L1]] -; CHECK-NEXT: i8 2, label [[L2:%.*]] +; CHECK-NEXT: i8 1, label [[L1]] +; CHECK-NEXT: i8 2, label [[L2:%.*]] ; CHECK-NEXT: ] ; CHECK: L1: ; CHECK-NEXT: call void @foo() @@ -192,8 +192,8 @@ define void @test_switch_cmp(i1 %cond, i32 %val, i8 %value) nounwind { ; CHECK: 0: ; CHECK-NEXT: [[TMP1:%.*]] = phi i8 [ [[VALUE:%.*]], [[L0]] ] ; CHECK-NEXT: switch i8 [[TMP1]], label [[L3:%.*]] [ -; CHECK-NEXT: i8 1, label [[L1]] -; CHECK-NEXT: i8 2, label [[L2:%.*]] +; CHECK-NEXT: i8 1, label [[L1]] +; CHECK-NEXT: i8 2, label [[L2:%.*]] ; CHECK-NEXT: ] ; CHECK: L1: ; CHECK-NEXT: call void @foo() @@ -237,8 +237,8 @@ define void @test_switch_default(ptr nocapture %status) nounwind { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[STATUS:%.*]], align 4 ; CHECK-NEXT: switch i32 [[TMP0]], label [[L2:%.*]] [ -; CHECK-NEXT: i32 5061, label [[L2_THREAD:%.*]] -; CHECK-NEXT: i32 0, label [[L2]] +; CHECK-NEXT: i32 5061, label [[L2_THREAD:%.*]] +; CHECK-NEXT: i32 0, label [[L2]] ; CHECK-NEXT: ] ; CHECK: L2.thread: ; CHECK-NEXT: store i32 10025, ptr [[STATUS]], align 4 @@ -377,21 +377,21 @@ define i32 @unfold3(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) noun ; CHECK-NEXT: br i1 [[CMP_I]], label [[DOTEXIT_THREAD4:%.*]], label [[COND_FALSE_I:%.*]] ; CHECK: cond.false.i: ; CHECK-NEXT: [[CMP4_I:%.*]] = icmp sgt i32 [[U]], [[V]] -; CHECK-NEXT: br i1 [[CMP4_I]], label [[DOTEXIT_THREAD4]], label [[COND_FALSE_6_I:%.*]] +; CHECK-NEXT: br i1 [[CMP4_I]], label [[DOTEXIT_THREAD:%.*]], label [[COND_FALSE_6_I:%.*]] ; CHECK: cond.false.6.i: ; CHECK-NEXT: [[CMP8_I:%.*]] = icmp slt i32 [[W:%.*]], [[X:%.*]] ; CHECK-NEXT: br i1 [[CMP8_I]], label [[DOTEXIT_THREAD4]], label [[COND_FALSE_10_I:%.*]] ; CHECK: cond.false.10.i: ; CHECK-NEXT: [[CMP13_I:%.*]] = icmp sgt i32 [[W]], [[X]] -; CHECK-NEXT: br i1 [[CMP13_I]], label [[DOTEXIT_THREAD4]], label [[DOTEXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP13_I]], label [[DOTEXIT_THREAD]], label [[DOTEXIT:%.*]] ; CHECK: .exit: ; CHECK-NEXT: [[PHITMP:%.*]] = icmp sge i32 [[Y:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[PHITMP]] -; CHECK-NEXT: br i1 [[COND_FR]], label [[DOTEXIT_THREAD:%.*]], label [[DOTEXIT_THREAD4]] -; CHECK: 0: -; CHECK-NEXT: br label [[DOTEXIT_THREAD4]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[DOTEXIT_THREAD]], label [[DOTEXIT_THREAD4]] ; CHECK: .exit.thread: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[ADD3]], [[DOTEXIT]] ], [ [[J]], [[DOTEXIT_THREAD]] ], [ [[J]], [[COND_FALSE_I]] ], [ [[J]], [[COND_FALSE_10_I]] ], [ [[ADD3]], [[ENTRY:%.*]] ], [ [[ADD3]], [[COND_FALSE_6_I]] ] +; CHECK-NEXT: br label [[DOTEXIT_THREAD4]] +; CHECK: .exit.thread4: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[J]], [[DOTEXIT_THREAD]] ], [ [[ADD3]], [[DOTEXIT]] ], [ [[ADD3]], [[ENTRY:%.*]] ], [ [[ADD3]], [[COND_FALSE_6_I]] ] ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: @@ -430,23 +430,23 @@ define i32 @unfold4(i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z, i32 %j) noun ; CHECK-NEXT: br i1 [[CMP_I]], label [[DOTEXIT_THREAD:%.*]], label [[COND_FALSE_I:%.*]] ; CHECK: cond.false.i: ; CHECK-NEXT: [[CMP4_I:%.*]] = icmp sgt i32 [[U]], [[V]] -; CHECK-NEXT: br i1 [[CMP4_I]], label [[DOTEXIT_THREAD]], label [[COND_FALSE_6_I:%.*]] +; CHECK-NEXT: 
br i1 [[CMP4_I]], label [[DOTEXIT_THREAD5:%.*]], label [[COND_FALSE_6_I:%.*]] ; CHECK: cond.false.6.i: ; CHECK-NEXT: [[CMP8_I:%.*]] = icmp slt i32 [[W:%.*]], [[X:%.*]] ; CHECK-NEXT: br i1 [[CMP8_I]], label [[DOTEXIT_THREAD]], label [[COND_FALSE_10_I:%.*]] ; CHECK: cond.false.10.i: ; CHECK-NEXT: [[CMP13_I:%.*]] = icmp sgt i32 [[W]], [[X]] -; CHECK-NEXT: br i1 [[CMP13_I]], label [[DOTEXIT_THREAD]], label [[DOTEXIT:%.*]] +; CHECK-NEXT: br i1 [[CMP13_I]], label [[DOTEXIT_THREAD5]], label [[DOTEXIT:%.*]] ; CHECK: .exit: ; CHECK-NEXT: [[CMP19_I:%.*]] = icmp sge i32 [[Y:%.*]], [[Z:%.*]] ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP19_I]] to i32 ; CHECK-NEXT: [[LNOT_I18:%.*]] = icmp eq i32 [[CONV]], 1 ; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[LNOT_I18]] -; CHECK-NEXT: br i1 [[COND_FR]], label [[TMP1:%.*]], label [[DOTEXIT_THREAD]] -; CHECK: 0: -; CHECK-NEXT: br label [[DOTEXIT_THREAD]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[DOTEXIT_THREAD]], label [[DOTEXIT_THREAD5]] ; CHECK: .exit.thread: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[ADD3]], [[DOTEXIT]] ], [ [[J]], [[TMP1]] ], [ [[J]], [[ENTRY:%.*]] ], [ [[J]], [[COND_FALSE_6_I]] ], [ [[ADD3]], [[COND_FALSE_I]] ], [ [[ADD3]], [[COND_FALSE_10_I]] ] +; CHECK-NEXT: br label [[DOTEXIT_THREAD5]] +; CHECK: .exit.thread5: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[J]], [[DOTEXIT_THREAD]] ], [ [[ADD3]], [[DOTEXIT]] ], [ [[ADD3]], [[COND_FALSE_I]] ], [ [[ADD3]], [[COND_FALSE_10_I]] ] ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: @@ -560,10 +560,10 @@ define void @test_func(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr ; CHECK: if.end: ; CHECK-NEXT: [[LOCAL_VAR_0:%.*]] = phi i32 [ [[TMP1]], [[FOR_BODY]] ] ; CHECK-NEXT: switch i32 [[LOCAL_VAR_0]], label [[SW_DEFAULT]] [ -; CHECK-NEXT: i32 2, label [[SW_BB]] -; CHECK-NEXT: i32 4, label [[SW_BB7]] -; CHECK-NEXT: i32 5, label [[SW_BB8:%.*]] -; CHECK-NEXT: i32 7, label [[SW_BB9:%.*]] +; CHECK-NEXT: i32 2, label [[SW_BB]] +; CHECK-NEXT: i32 4, label [[SW_BB7]] +; CHECK-NEXT: i32 5, label [[SW_BB8:%.*]] +; CHECK-NEXT: i32 7, label [[SW_BB9:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: call void @foo() @@ -674,5 +674,3 @@ if.end: ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1984} ; CHECK: [[PROF1]] = !{!"branch_weights", i64 1073741824, i64 3221225472} ;. -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK-BPI: {{.*}} diff --git a/llvm/test/Transforms/JumpThreading/thread-prob-7.ll b/llvm/test/Transforms/JumpThreading/thread-prob-7.ll index 4623a579be48f6..8c9d89871d00b3 100644 --- a/llvm/test/Transforms/JumpThreading/thread-prob-7.ll +++ b/llvm/test/Transforms/JumpThreading/thread-prob-7.ll @@ -14,15 +14,15 @@ define i32 @func0(i32 %a0, i32 %a1) !prof !0 { ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_JOIN_THREAD:%.*]], label [[TEST2_FALSE:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: test2_false: ; CHECK-NEXT: call void @foobar() -; CHECK-NEXT: br label [[BB_JOIN_THREAD]] +; CHECK-NEXT: br label [[TMP0:%.*]] ; CHECK: bb_join: ; CHECK-NEXT: [[C:%.*]] = phi i1 [ [[CX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[C]] -; CHECK-NEXT: br i1 [[COND_FR]], label [[BB_JOIN_THREAD1:%.*]], label [[BB_JOIN_THREAD]], !prof [[PROF3:![0-9]+]] +; CHECK-NEXT: br i1 [[COND_FR]], label [[BB_JOIN_THREAD]], label [[TMP0]], !prof [[PROF3:![0-9]+]] ; CHECK: bb_join.thread: -; CHECK-NEXT: br label [[BB_JOIN_THREAD]] +; CHECK-NEXT: br label [[TMP0]] ; CHECK: 0: -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 7, [[BB_JOIN]] ], [ 7, [[TEST2_FALSE]] ], [ 42, [[TEST2]] ], [ 42, [[BB_JOIN_THREAD1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 42, [[BB_JOIN_THREAD]] ], [ 7, [[BB_JOIN]] ], [ 7, [[TEST2_FALSE]] ] ; CHECK-NEXT: ret i32 [[TMP1]] ; entry: diff --git a/llvm/test/Transforms/JumpThreading/uncond-no-phi.ll b/llvm/test/Transforms/JumpThreading/uncond-no-phi.ll deleted file mode 100644 index 6104e8f8778bc0..00000000000000 --- a/llvm/test/Transforms/JumpThreading/uncond-no-phi.ll +++ /dev/null @@ -1,123 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -passes=jump-threading -S < %s | FileCheck %s - -define i1 @if_else(i1 %c, i1 %c1) { -; CHECK-LABEL: define i1 @if_else( -; CHECK-SAME: i1 [[C:%.*]], i1 [[C1:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[RETURN:%.*]] -; CHECK: then: -; CHECK-NEXT: call void @dummy() -; CHECK-NEXT: br i1 [[C1]], label [[ELSE:%.*]], label [[RETURN]] -; CHECK: else: -; CHECK-NEXT: br label [[RETURN]] -; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i1 [ false, [[THEN]] ], [ true, [[ENTRY:%.*]] ], [ true, [[ELSE]] ] -; CHECK-NEXT: ret i1 [[RETVAL_0]] -; -entry: - br i1 %c, label %then, label %else - -then: - call void @dummy() - br i1 %c1, label %else, label %return - -else: - br label %return - -return: - %retval.0 = phi i1 [ true, %else ], [ false, %then ] - ret i1 %retval.0 -} - -define i8 @switch_uncond(i8 %arg) { -; CHECK-LABEL: define i8 @switch_uncond( -; CHECK-SAME: i8 [[ARG:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: switch i8 [[ARG]], label [[DEFAULT:%.*]] [ -; CHECK-NEXT: i8 0, label [[BB1:%.*]] -; CHECK-NEXT: i8 1, label [[BB3:%.*]] -; CHECK-NEXT: i8 2, label [[BB2:%.*]] -; CHECK-NEXT: i8 3, label [[END:%.*]] -; CHECK-NEXT: ] -; CHECK: default: -; CHECK-NEXT: unreachable -; CHECK: bb: -; CHECK-NEXT: call void @dummy() -; CHECK-NEXT: br label [[END]] -; CHECK: bb1: -; CHECK-NEXT: call void @dummy() -; CHECK-NEXT: br label [[END]] -; CHECK: bb2: -; CHECK-NEXT: br label [[END]] -; CHECK: end: -; CHECK-NEXT: [[PHI:%.*]] = phi i8 [ 1, [[ENTRY:%.*]] ], [ 0, [[BB3]] ], [ 0, [[BB1]] ], [ 0, [[BB2]] ] -; CHECK-NEXT: ret i8 [[PHI]] -; -entry: - switch i8 %arg, label %default [ - i8 0, label %bb - i8 1, label %bb1 - i8 2, label %bb2 - i8 3, label %end - ] - -default: - unreachable - -bb: - call void @dummy() - br label %bb2 - -bb1: - 
call void @dummy() - br label %bb2 - -; Predecessors of %bb2 are %bb and %bb1, they are not identical. -; So we can thread %bb2. -bb2: - br label %end - -end: - %phi = phi i8 [ 0, %bb2 ], [ 1, %entry ] - ret i8 %phi -} - -define i8 @switch_uncond_fail(i8 %arg) { -; CHECK-LABEL: define i8 @switch_uncond_fail( -; CHECK-SAME: i8 [[ARG:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: switch i8 [[ARG]], label [[DEFAULT:%.*]] [ -; CHECK-NEXT: i8 0, label [[BB:%.*]] -; CHECK-NEXT: i8 1, label [[BB]] -; CHECK-NEXT: i8 2, label [[END:%.*]] -; CHECK-NEXT: ] -; CHECK: default: -; CHECK-NEXT: br label [[END]] -; CHECK: bb: -; CHECK-NEXT: br label [[END]] -; CHECK: end: -; CHECK-NEXT: [[PHI:%.*]] = phi i8 [ 0, [[BB]] ], [ 1, [[ENTRY:%.*]] ], [ 2, [[DEFAULT]] ] -; CHECK-NEXT: ret i8 [[PHI]] -; -entry: - switch i8 %arg, label %default [ - i8 0, label %bb - i8 1, label %bb - i8 2, label %end - ] - -default: - br label %end - -; Predecessor of %bb is only %entry (though there are two in predecessor list), -; thus it's unthreadable. -bb: - br label %end - -end: - %phi = phi i8 [ 0, %bb ], [ 1, %entry ], [ 2, %default ] - ret i8 %phi -} - -declare void @dummy() diff --git a/llvm/test/Transforms/PhaseOrdering/thread-uncond-bb.ll b/llvm/test/Transforms/PhaseOrdering/thread-uncond-bb.ll deleted file mode 100644 index 17146d7d5987fc..00000000000000 --- a/llvm/test/Transforms/PhaseOrdering/thread-uncond-bb.ll +++ /dev/null @@ -1,62 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt < %s -O3 -S | FileCheck %s - -define i32 @thread_uncond_bb_cmp(i1 %c, i32 %v) { -; CHECK-LABEL: define i32 @thread_uncond_bb_cmp( -; CHECK-SAME: i1 [[C:%.*]], i32 [[V:%.*]]) local_unnamed_addr { -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[C]], label [[DO_END:%.*]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: tail call void @dummy() -; CHECK-NEXT: br label [[DO_END]] -; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[V]], [[IF_THEN]] ] -; CHECK-NEXT: ret i32 [[RETVAL]] -; -entry: - br i1 %c, label %do.end, label %if.then - -if.then: ; preds = %entry - call void @dummy() - %tobool = icmp eq i32 %v, 0 - br i1 %tobool, label %do.end, label %return - -do.end: ; preds = %entry, %if.then - br label %return - -return: ; preds = %if.then, %do.end - %retval = phi i32 [ 0, %do.end ], [ %v, %if.then ] - ret i32 %retval -} - -define i32 @thread_uncond_bb_cmp_zext(i1 %c, i32 %v) { -; CHECK-LABEL: define i32 @thread_uncond_bb_cmp_zext( -; CHECK-SAME: i1 [[C:%.*]], i32 [[V:%.*]]) local_unnamed_addr { -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[C]], label [[DO_END:%.*]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: tail call void @dummy() -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[V]], 0 -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[TOBOOL]] to i32 -; CHECK-NEXT: br label [[DO_END]] -; CHECK: return: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ] -; CHECK-NEXT: ret i32 [[RETVAL]] -; -entry: - br i1 %c, label %do.end, label %if.then - -if.then: ; preds = %entry - call void @dummy() - %tobool = icmp eq i32 %v, 0 - br i1 %tobool, label %do.end, label %return - -do.end: ; preds = %entry, %if.then - br label %return - -return: ; preds = %if.then, %do.end - %retval = phi i32 [ 0, %do.end ], [ 1, %if.then ] - ret i32 %retval -} - -declare void @dummy() From 22629bb22a1bea95eebfc9b3171005de107c38f1 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 16 Apr 2024 10:54:28 -0400 Subject: 
[PATCH 113/300] [libc++] Use availability to rely on key functions for bad_expected_access and bad_function_call (#87390)

This patch uses our availability machinery to allow defining a key function
for bad_function_call and bad_expected_access at all times, while only relying
on it when we can. This prevents compilers from complaining about weak vtables
and reduces code bloat and the amount of work done by the dynamic linker.

rdar://111917845
---
 libcxx/include/__availability | 16 ++++++++++++
 libcxx/include/__config | 26 ++++---------------
 .../include/__expected/bad_expected_access.h | 13 ++++++----
 libcxx/include/__functional/function.h | 5 +++-
 ...bcxxabi.v1.stable.exceptions.nonew.abilist | 4 +++
 ...bcxxabi.v1.stable.exceptions.nonew.abilist | 4 +++
 ...bcxxabi.v1.stable.exceptions.nonew.abilist | 4 +++
 ...bcxxabi.v1.stable.exceptions.nonew.abilist | 4 +++
 ...bcxxabi.v1.stable.exceptions.nonew.abilist | 4 +++
 ...bcxxabi.v1.stable.exceptions.nonew.abilist | 4 +++
 ...bcxxabi.v1.stable.exceptions.nonew.abilist | 4 +++
 ...bcxxabi.v1.stable.exceptions.nonew.abilist | 4 +++
 ...xxabi.v1.stable.noexceptions.nonew.abilist | 4 +++
 libcxx/src/CMakeLists.txt | 1 +
 libcxx/src/expected.cpp | 13 ++++++++++
 libcxx/src/functional.cpp | 2 --
 16 files changed, 83 insertions(+), 29 deletions(-)
 create mode 100644 libcxx/src/expected.cpp

diff --git a/libcxx/include/__availability b/libcxx/include/__availability
index bb3ed0a8da521b..aa761eb5bfe5e3 100644
--- a/libcxx/include/__availability
+++ b/libcxx/include/__availability
@@ -160,6 +160,15 @@
 # define _LIBCPP_AVAILABILITY_HAS_TZDB 1
 # define _LIBCPP_AVAILABILITY_TZDB

+// These macros determine whether we assume that std::bad_function_call and
+// std::bad_expected_access provide a key function in the dylib. This allows
+// centralizing their vtable and typeinfo instead of having all TUs provide
+// a weak definition that then gets deduplicated.
+# define _LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION 1
+# define _LIBCPP_AVAILABILITY_BAD_FUNCTION_CALL_KEY_FUNCTION
+# define _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION 1
+# define _LIBCPP_AVAILABILITY_BAD_EXPECTED_ACCESS_KEY_FUNCTION
+
 #elif defined(__APPLE__)

 # define _LIBCPP_AVAILABILITY_HAS_BAD_OPTIONAL_ACCESS \
@@ -290,6 +299,13 @@
 # else
 # define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 1
 # endif
+
+# define _LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION 0
+# define _LIBCPP_AVAILABILITY_BAD_FUNCTION_CALL_KEY_FUNCTION __attribute__((unavailable))
+
+# define _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION 0
+# define _LIBCPP_AVAILABILITY_BAD_EXPECTED_ACCESS_KEY_FUNCTION __attribute__((unavailable))
+
 #else

 // ...New vendors can add availability markup here...
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 82782b31c557b1..e9fda9cd24ebbe 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -120,14 +120,11 @@
 # define _LIBCPP_ABI_FIX_UNORDERED_NODE_POINTER_UB
 # define _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB
 # define _LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE
-// Define a key function for `bad_function_call` in the library, to centralize
-// its vtable and typeinfo to libc++ rather than having all other libraries
-// using that class define their own copies.
-# define _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION
-// Override the default return value of exception::what() for
-// bad_function_call::what() with a string that is specific to
-// bad_function_call (see http://wg21.link/LWG2233). This is an ABI break
-// because it changes the vtable layout of bad_function_call.
+// Override the default return value of exception::what() for bad_function_call::what()
+// with a string that is specific to bad_function_call (see http://wg21.link/LWG2233).
+// This is an ABI break on platforms that sign and authenticate vtable function pointers
+// because it changes the mangling of the virtual function located in the vtable, which
+// changes how it gets signed.
 # define _LIBCPP_ABI_BAD_FUNCTION_CALL_GOOD_WHAT_MESSAGE
 // Enable optimized version of __do_get_(un)signed which avoids redundant copies.
 # define _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET
@@ -197,19 +194,6 @@
 # if defined(__FreeBSD__) && __FreeBSD__ < 14
 # define _LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR
 # endif
-// For XCOFF linkers, we have problems if we see a weak hidden version of a symbol
-// in user code (like you get with -fvisibility-inlines-hidden) and then a strong def
-// in the library, so we need to always rely on the library version.
-# if defined(_AIX)
-# define _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION
-# endif
-# endif
-
-# if defined(_LIBCPP_BUILDING_LIBRARY) || _LIBCPP_ABI_VERSION >= 2
-// Define a key function for `bad_function_call` in the library, to centralize
-// its vtable and typeinfo to libc++ rather than having all other libraries
-// using that class define their own copies.
-# define _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION
 # endif

 // We had some bugs where we use [[no_unique_address]] together with construct_at,
diff --git a/libcxx/include/__expected/bad_expected_access.h b/libcxx/include/__expected/bad_expected_access.h
index 9d490307b68081..ef29fa50883136 100644
--- a/libcxx/include/__expected/bad_expected_access.h
+++ b/libcxx/include/__expected/bad_expected_access.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H
 #define _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H

+#include <__availability>
 #include <__config>
 #include <__exception/exception.h>
 #include <__utility/move.h>
@@ -28,9 +29,11 @@ template <class _Err>
 class bad_expected_access;

 _LIBCPP_DIAGNOSTIC_PUSH
+# if !_LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wweak-vtables")
+# endif
 template <>
-class bad_expected_access<void> : public exception {
+class _LIBCPP_EXPORTED_FROM_ABI bad_expected_access<void> : public exception {
 protected:
 _LIBCPP_HIDE_FROM_ABI bad_expected_access() noexcept = default;
 _LIBCPP_HIDE_FROM_ABI bad_expected_access(const bad_expected_access&) noexcept = default;
@@ -40,11 +43,11 @@ class bad_expected_access<void> : public exception {
 _LIBCPP_HIDE_FROM_ABI_VIRTUAL ~bad_expected_access() override = default;

 public:
- // The way this has been designed (by using a class template below) means that we'll already
- // have a profusion of these vtables in TUs, and the dynamic linker will already have a bunch
- // of work to do. So it is not worth hiding the specialization in the dylib, given that
- // it adds deployment target restrictions.
+# if _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION + const char* what() const noexcept override; +# else _LIBCPP_HIDE_FROM_ABI_VIRTUAL const char* what() const noexcept override { return "bad access to std::expected"; } +# endif }; _LIBCPP_DIAGNOSTIC_POP diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index 1faa9e92ebd63e..36057706933d43 100644 --- a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -11,6 +11,7 @@ #define _LIBCPP___FUNCTIONAL_FUNCTION_H #include <__assert> +#include <__availability> #include <__config> #include <__exception/exception.h> #include <__functional/binary_function.h> @@ -55,7 +56,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD // bad_function_call _LIBCPP_DIAGNOSTIC_PUSH +# if !_LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wweak-vtables") +# endif class _LIBCPP_EXPORTED_FROM_ABI bad_function_call : public exception { public: _LIBCPP_HIDE_FROM_ABI bad_function_call() _NOEXCEPT = default; @@ -64,7 +67,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_function_call : public exception { // Note that when a key function is not used, every translation unit that uses // bad_function_call will end up containing a weak definition of the vtable and // typeinfo. -# ifdef _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION +# if _LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION ~bad_function_call() _NOEXCEPT override; # else _LIBCPP_HIDE_FROM_ABI_VIRTUAL ~bad_function_call() _NOEXCEPT override {} diff --git a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index 46353986f5d7d7..64cf368e6e6849 100644 --- a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -575,6 +575,7 @@ {'is_defined': True, 'name': '__ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNKSt3__119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftER11__mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -2073,6 +2074,7 @@ {'is_defined': True, 'name': '__ZTINSt3__117moneypunct_bynameIwLb1EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__119__shared_weak_countE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTINSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'size': 0, 'type': 'OBJECT'} @@ -2264,6 +2266,7 @@ {'is_defined': 
True, 'name': '__ZTSNSt3__117moneypunct_bynameIwLb0EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__117moneypunct_bynameIwLb1EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTSNSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__13pmr15memory_resourceE', 'size': 0, 'type': 'OBJECT'} @@ -2482,6 +2485,7 @@ {'is_defined': True, 'name': '__ZTVNSt3__117moneypunct_bynameIwLb1EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__119__shared_weak_countE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTVNSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 0, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index fec3a4505a0c6d..8751dffe230259 100644 --- a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -209,6 +209,7 @@ {'is_defined': True, 'name': '_ZNKSt6__ndk118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNKSt6__ndk119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE10do_unshiftER9mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -1722,6 +1723,7 @@ {'is_defined': True, 'name': '_ZTINSt6__ndk118__time_get_storageIwEE', 'size': 12, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 12, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119__shared_weak_countE', 'size': 24, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTINSt6__ndk119bad_expected_accessIvEE', 'size': 12, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 12, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 12, 'type': 'OBJECT'} {'is_defined': True, 'name': 
'_ZTINSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 12, 'type': 'OBJECT'} @@ -1958,6 +1960,7 @@ {'is_defined': True, 'name': '_ZTSNSt6__ndk118__time_get_storageIwEE', 'size': 35, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 72, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk119__shared_weak_countE', 'size': 33, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTSNSt6__ndk119bad_expected_accessIvEE', 'size': 36, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 73, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 73, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 38, 'type': 'OBJECT'} @@ -2188,6 +2191,7 @@ {'is_defined': True, 'name': '_ZTVNSt6__ndk117moneypunct_bynameIwLb1EEE', 'size': 56, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 60, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119__shared_weak_countE', 'size': 28, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTVNSt6__ndk119bad_expected_accessIvEE', 'size': 20, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 40, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 40, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 48, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index e52cf98dd4c4f1..7e223e66528847 100644 --- a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -99,6 +99,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__119bad_expected_accessIvE4whatEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftERPcS2_S2_S3_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} @@ -910,6 +911,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__117bad_function_callE', 'storage_mapping_class': 'RW', 'type': 
'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__119__shared_weak_countE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} @@ -969,6 +971,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__117bad_function_callE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__119__shared_weak_countE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} @@ -1031,6 +1034,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__117bad_function_callE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__119__shared_weak_countE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index 52a04706ddf20b..407d0456757af2 100644 --- a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -99,6 +99,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': 
'_ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__119bad_expected_accessIvE4whatEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftERPcS2_S2_S3_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} @@ -910,6 +911,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__117bad_function_callE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__119__shared_weak_countE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} @@ -969,6 +971,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__117bad_function_callE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__119__shared_weak_countE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} @@ -1031,6 +1034,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__117bad_function_callE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__119__shared_weak_countE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': 
'_ZTVNSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index c169b4a9925219..d578b41383c0e3 100644 --- a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -575,6 +575,7 @@ {'is_defined': True, 'name': '__ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNKSt3__119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftER11__mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -2087,6 +2088,7 @@ {'is_defined': True, 'name': '__ZTINSt3__118__time_get_storageIwEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__119__shared_weak_countE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTINSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'size': 0, 'type': 'OBJECT'} @@ -2291,6 +2293,7 @@ {'is_defined': True, 'name': '__ZTSNSt3__117moneypunct_bynameIwLb0EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__117moneypunct_bynameIwLb1EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTSNSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__13pmr15memory_resourceE', 'size': 0, 'type': 'OBJECT'} @@ -2516,6 +2519,7 @@ {'is_defined': True, 'name': '__ZTVNSt3__117moneypunct_bynameIwLb1EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__119__shared_weak_countE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTVNSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': 
'__ZTVNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 0, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index efa2189e9c9287..fc0f4fcf415e63 100644 --- a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -209,6 +209,7 @@ {'is_defined': True, 'name': '_ZNKSt6__ndk118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNKSt6__ndk119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE10do_unshiftER9mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -1722,6 +1723,7 @@ {'is_defined': True, 'name': '_ZTINSt6__ndk118__time_get_storageIwEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119__shared_weak_countE', 'size': 40, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTINSt6__ndk119bad_expected_accessIvEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 24, 'type': 'OBJECT'} @@ -1955,6 +1957,7 @@ {'is_defined': True, 'name': '_ZTSNSt6__ndk118__time_get_storageIwEE', 'size': 35, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 72, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk119__shared_weak_countE', 'size': 33, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTSNSt6__ndk119bad_expected_accessIvEE', 'size': 36, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 73, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 73, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 38, 'type': 'OBJECT'} @@ -2182,6 +2185,7 @@ {'is_defined': True, 'name': '_ZTVNSt6__ndk117moneypunct_bynameIwLb1EEE', 'size': 112, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 120, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119__shared_weak_countE', 'size': 56, 
'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTVNSt6__ndk119bad_expected_accessIvEE', 'size': 40, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 96, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist index ebda5b0dfba57d..4022339562b3ad 100644 --- a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -266,6 +266,7 @@ {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNKSt3__119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftER11__mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -1695,6 +1696,7 @@ {'is_defined': True, 'name': '_ZTINSt3__118__time_get_storageIwEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119__shared_weak_countE', 'size': 40, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTINSt3__119bad_expected_accessIvEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'size': 24, 'type': 'OBJECT'} @@ -1829,6 +1831,7 @@ {'is_defined': True, 'name': '_ZTSNSt3__118__time_get_storageIwEE', 'size': 32, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 69, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119__shared_weak_countE', 'size': 30, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTSNSt3__119bad_expected_accessIvEE', 'size': 33, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 35, 'type': 'OBJECT'} @@ -1962,6 +1965,7 @@ {'is_defined': True, 'name': '_ZTVNSt3__117moneypunct_bynameIwLb1EEE', 'size': 112, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 120, 'type': 
'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119__shared_weak_countE', 'size': 56, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTVNSt3__119bad_expected_accessIvEE', 'size': 40, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 96, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist index 6432ad3be35859..574c4504c59b8e 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -264,6 +264,7 @@ {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNKSt3__119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftER11__mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -1696,6 +1697,7 @@ {'is_defined': True, 'name': '_ZTINSt3__118__time_get_storageIwEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119__shared_weak_countE', 'size': 40, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTINSt3__119bad_expected_accessIvEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'size': 24, 'type': 'OBJECT'} @@ -1830,6 +1832,7 @@ {'is_defined': True, 'name': '_ZTSNSt3__118__time_get_storageIwEE', 'size': 32, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 69, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119__shared_weak_countE', 'size': 30, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTSNSt3__119bad_expected_accessIvEE', 'size': 33, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 35, 'type': 'OBJECT'} @@ -1963,6 +1966,7 @@ {'is_defined': True, 'name': '_ZTVNSt3__117moneypunct_bynameIwLb1EEE', 'size': 112, 'type': 'OBJECT'} {'is_defined': True, 'name': 
'_ZTVNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 120, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119__shared_weak_countE', 'size': 56, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTVNSt3__119bad_expected_accessIvEE', 'size': 40, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 96, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist index 1fe84e17b3f7f0..665546699e8ded 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist @@ -235,6 +235,7 @@ {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNKSt3__119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftER11__mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -1667,6 +1668,7 @@ {'is_defined': True, 'name': '_ZTINSt3__118__time_get_storageIwEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119__shared_weak_countE', 'size': 40, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTINSt3__119bad_expected_accessIvEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'size': 24, 'type': 'OBJECT'} @@ -1801,6 +1803,7 @@ {'is_defined': True, 'name': '_ZTSNSt3__118__time_get_storageIwEE', 'size': 32, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 69, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119__shared_weak_countE', 'size': 30, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTSNSt3__119bad_expected_accessIvEE', 'size': 33, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 35, 'type': 'OBJECT'} @@ -1934,6 +1937,7 @@ {'is_defined': True, 'name': 
'_ZTVNSt3__117moneypunct_bynameIwLb1EEE', 'size': 112, 'type': 'OBJECT'}
{'is_defined': True, 'name': '_ZTVNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 120, 'type': 'OBJECT'}
{'is_defined': True, 'name': '_ZTVNSt3__119__shared_weak_countE', 'size': 56, 'type': 'OBJECT'}
+{'is_defined': True, 'name': '_ZTVNSt3__119bad_expected_accessIvEE', 'size': 40, 'type': 'OBJECT'}
{'is_defined': True, 'name': '_ZTVNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'}
{'is_defined': True, 'name': '_ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'}
{'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 96, 'type': 'OBJECT'}
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 208500ec14fcdc..a4a3fee8645710 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -10,6 +10,7 @@ set(LIBCXX_SOURCES
 chrono.cpp
 error_category.cpp
 exception.cpp
+ expected.cpp
 filesystem/filesystem_clock.cpp
 filesystem/filesystem_error.cpp
 filesystem/path_parser.h
diff --git a/libcxx/src/expected.cpp b/libcxx/src/expected.cpp
new file mode 100644
index 00000000000000..f30efb5164796b
--- /dev/null
+++ b/libcxx/src/expected.cpp
@@ -0,0 +1,13 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <expected>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+const char* bad_expected_access<void>::what() const noexcept { return "bad access to std::expected"; }
+_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/src/functional.cpp b/libcxx/src/functional.cpp
index 570bb78e150b7d..ef53e3e84da0e0 100644
--- a/libcxx/src/functional.cpp
+++ b/libcxx/src/functional.cpp
@@ -10,9 +10,7 @@

 _LIBCPP_BEGIN_NAMESPACE_STD

-#ifdef _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION
 bad_function_call::~bad_function_call() noexcept {}
-#endif

 #ifdef _LIBCPP_ABI_BAD_FUNCTION_CALL_GOOD_WHAT_MESSAGE
 const char* bad_function_call::what() const noexcept { return "std::bad_function_call"; }

From 9ddedf07ed80076e0e419940753aeaaf719a09ec Mon Sep 17 00:00:00 2001
From: Louis Dionne
Date: Tue, 16 Apr 2024 10:57:48 -0400
Subject: [PATCH 114/300] [libc++] Deprecate the C++20 synchronization library
 before C++20 (#86410)

When we initially implemented the C++20 synchronization library, we
reluctantly agreed to backport the implementation to C++03 at the request of
the person who provided the patch. This was when we were only starting to
have experience with the issues this can create, so we flinched. Nowadays,
we have a much stricter stance about not backporting features to previous
standards.

We have recently started fixing several bugs (and near bugs) in our
implementation of the synchronization library. A recurring theme during
these reviews has been how difficult the current code is to understand, and
upon inspection it becomes clear that being able to use a few recent C++
features (in particular lambdas) would help a great deal. The code would
still be pretty intricate, but it would be a lot easier to reason about the
flow of callbacks through things like __thread_poll_with_backoff.

As a result, this patch deprecates support for the synchronization library
before C++20.
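
As an illustration, a snippet like the following (hypothetical user code, not
part of this patch) now triggers a deprecation warning when built against
libc++ in pre-C++20 modes such as -std=c++17:

    #include <semaphore>

    std::binary_semaphore sem(1); // warning: 'binary_semaphore' is deprecated:
                                  // The C++20 synchronization library has been
                                  // deprecated prior to C++20
                                  // [-Wdeprecated-declarations]

The warning can be silenced with -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS, as
the tests below do, but the long-term fix is to build with -std=c++20 or
later.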
In the next release, we can remove that support entirely.
---
 libcxx/.clang-format | 1 +
 libcxx/docs/ReleaseNotes/19.rst | 4 +++
 libcxx/include/__atomic/atomic.h | 12 ++++---
 libcxx/include/__atomic/atomic_flag.h | 34 +++++++++++--------
 libcxx/include/__config | 8 +++++
 libcxx/include/barrier | 2 +-
 libcxx/include/latch | 2 +-
 libcxx/include/semaphore | 6 ++--
 .../atomic_notify_all.pass.cpp | 5 ++-
 .../atomic_notify_one.pass.cpp | 5 ++-
 .../atomic_wait.pass.cpp | 5 ++-
 .../atomic_wait_explicit.pass.cpp | 5 ++-
 .../std/thread/thread.barrier/arrive.pass.cpp | 3 ++
 .../thread.barrier/arrive_and_drop.pass.cpp | 3 ++
 .../thread.barrier/arrive_and_wait.pass.cpp | 3 ++
 .../thread/thread.barrier/completion.pass.cpp | 3 ++
 .../thread.barrier/ctor.compile.pass.cpp | 3 ++
 .../std/thread/thread.barrier/max.pass.cpp | 3 ++
 .../thread.latch/arrive_and_wait.pass.cpp | 3 ++
 .../thread/thread.latch/count_down.pass.cpp | 3 ++
 .../std/thread/thread.latch/ctor.pass.cpp | 3 ++
 .../test/std/thread/thread.latch/max.pass.cpp | 3 ++
 .../std/thread/thread.latch/try_wait.pass.cpp | 3 ++
 .../thread/thread.semaphore/acquire.pass.cpp | 3 ++
 .../thread/thread.semaphore/binary.pass.cpp | 3 ++
 .../thread.semaphore/ctor.compile.pass.cpp | 3 ++
 .../std/thread/thread.semaphore/max.pass.cpp | 3 ++
 .../thread/thread.semaphore/release.pass.cpp | 3 ++
 .../thread/thread.semaphore/timed.pass.cpp | 3 ++
 .../thread.semaphore/try_acquire.pass.cpp | 3 ++
 30 files changed, 117 insertions(+), 26 deletions(-)

diff --git a/libcxx/.clang-format b/libcxx/.clang-format
index 39ae1322ffa8a6..c37ab817bca906 100644
--- a/libcxx/.clang-format
+++ b/libcxx/.clang-format
@@ -24,6 +24,7 @@ AttributeMacros: [
 '_LIBCPP_CONSTEXPR_SINCE_CXX23',
 '_LIBCPP_CONSTEXPR',
 '_LIBCPP_CONSTINIT',
+ '_LIBCPP_DEPRECATED_ATOMIC_SYNC',
 '_LIBCPP_DEPRECATED_IN_CXX11',
 '_LIBCPP_DEPRECATED_IN_CXX14',
 '_LIBCPP_DEPRECATED_IN_CXX17',
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index e5db17daa48233..45aac88e455024 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -75,6 +75,10 @@ Improvements and New Features
 Deprecations and Removals
 -------------------------

+- The C++20 synchronization library (``<barrier>``, ``<latch>``, ``atomic::wait``, etc.) has been deprecated
+ in language modes prior to C++20. If you are using these features prior to C++20, please update to ``-std=c++20``.
+ In LLVM 20, the C++20 synchronization library will be removed entirely in language modes prior to C++20.
+
 - TODO: The ``LIBCXX_ENABLE_ASSERTIONS`` CMake variable that was used to enable the safe mode has been deprecated
 and setting it triggers an error; use the ``LIBCXX_HARDENING_MODE`` CMake variable with the value ``extensive`` instead.
 Similarly, the ``_LIBCPP_ENABLE_ASSERTIONS`` macro has been deprecated (setting it to ``1`` still enables the extensive mode in
diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h
index 3dfb6937d0325e..bd3f659c22df01 100644
--- a/libcxx/include/__atomic/atomic.h
+++ b/libcxx/include/__atomic/atomic.h
@@ -462,22 +462,26 @@ atomic_wait_explicit(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __
 // atomic_notify_one

 template <class _Tp>
-_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_one(volatile atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
+atomic_notify_one(volatile atomic<_Tp>* __o) _NOEXCEPT {
 __o->notify_one();
 }
 template <class _Tp>
-_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_one(atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
+atomic_notify_one(atomic<_Tp>* __o) _NOEXCEPT {
 __o->notify_one();
 }

 // atomic_notify_all

 template <class _Tp>
-_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_all(volatile atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
+atomic_notify_all(volatile atomic<_Tp>* __o) _NOEXCEPT {
 __o->notify_all();
 }
 template <class _Tp>
-_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_all(atomic<_Tp>* __o) _NOEXCEPT {
+_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
+atomic_notify_all(atomic<_Tp>* __o) _NOEXCEPT {
 __o->notify_all();
 }

diff --git a/libcxx/include/__atomic/atomic_flag.h b/libcxx/include/__atomic/atomic_flag.h
index 084366237c16eb..3ec3366ecaaf98 100644
--- a/libcxx/include/__atomic/atomic_flag.h
+++ b/libcxx/include/__atomic/atomic_flag.h
@@ -49,22 +49,26 @@ struct atomic_flag {
 __cxx_atomic_store(&__a_, _LIBCPP_ATOMIC_FLAG_TYPE(false), __m);
 }

- _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void wait(bool __v, memory_order __m = memory_order_seq_cst) const
- volatile _NOEXCEPT {
+ _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
+ wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile _NOEXCEPT {
 std::__atomic_wait(*this, _LIBCPP_ATOMIC_FLAG_TYPE(__v), __m);
 }
- _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
+ _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void
 wait(bool __v, memory_order __m = memory_order_seq_cst) const _NOEXCEPT {
 std::__atomic_wait(*this, _LIBCPP_ATOMIC_FLAG_TYPE(__v), __m);
 }

- _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() volatile _NOEXCEPT {
+ _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() volatile _NOEXCEPT {
+ std::__atomic_notify_one(*this);
+ }
+ _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() _NOEXCEPT {
 std::__atomic_notify_one(*this);
 }
- _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() _NOEXCEPT { std::__atomic_notify_one(*this); }
 _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() volatile _NOEXCEPT {
 std::__atomic_notify_all(*this);
 }
- _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT { std::__atomic_notify_all(*this); }
+ _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT {
+ std::__atomic_notify_all(*this);
+ }

 #if _LIBCPP_STD_VER >= 20
 _LIBCPP_HIDE_FROM_ABI constexpr atomic_flag() _NOEXCEPT : __a_(false) {}
{} @@ -141,41 +145,43 @@ inline _LIBCPP_HIDE_FROM_ABI void atomic_flag_clear_explicit(atomic_flag* __o, m __o->clear(__m); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_wait(const volatile atomic_flag* __o, bool __v) _NOEXCEPT { __o->wait(__v); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_wait(const atomic_flag* __o, bool __v) _NOEXCEPT { __o->wait(__v); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_wait_explicit(const volatile atomic_flag* __o, bool __v, memory_order __m) _NOEXCEPT { __o->wait(__v, __m); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_wait_explicit(const atomic_flag* __o, bool __v, memory_order __m) _NOEXCEPT { __o->wait(__v, __m); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_one(volatile atomic_flag* __o) _NOEXCEPT { __o->notify_one(); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_one(atomic_flag* __o) _NOEXCEPT { +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +atomic_flag_notify_one(atomic_flag* __o) _NOEXCEPT { __o->notify_one(); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_all(volatile atomic_flag* __o) _NOEXCEPT { __o->notify_all(); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_all(atomic_flag* __o) _NOEXCEPT { +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +atomic_flag_notify_all(atomic_flag* __o) _NOEXCEPT { __o->notify_all(); } diff --git a/libcxx/include/__config b/libcxx/include/__config index e9fda9cd24ebbe..9b4155af1e3c65 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -956,6 +956,14 @@ typedef __char32_t char32_t; # define _LIBCPP_DEPRECATED_(m) # endif +# if _LIBCPP_STD_VER < 20 +# define _LIBCPP_DEPRECATED_ATOMIC_SYNC \ + _LIBCPP_DEPRECATED_("The C++20 synchronization library has been deprecated prior to C++20. 
Please update to " \ + "using -std=c++20 if you need to use these facilities.") +# else +# define _LIBCPP_DEPRECATED_ATOMIC_SYNC /* nothing */ +# endif + # if !defined(_LIBCPP_CXX03_LANG) # define _LIBCPP_DEPRECATED_IN_CXX11 _LIBCPP_DEPRECATED # else diff --git a/libcxx/include/barrier b/libcxx/include/barrier index c5fd84b91925b1..d776078267625a 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -257,7 +257,7 @@ public: # endif // !_LIBCPP_HAS_NO_TREE_BARRIER template -class barrier { +class _LIBCPP_DEPRECATED_ATOMIC_SYNC barrier { __barrier_base<_CompletionF> __b_; public: diff --git a/libcxx/include/latch b/libcxx/include/latch index 3cc72583811434..1937617f7dcc61 100644 --- a/libcxx/include/latch +++ b/libcxx/include/latch @@ -66,7 +66,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -class latch { +class _LIBCPP_DEPRECATED_ATOMIC_SYNC latch { __atomic_base __a_; public: diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore index 1375ec3f7c04b1..cb2f42c106ca85 100644 --- a/libcxx/include/semaphore +++ b/libcxx/include/semaphore @@ -127,7 +127,7 @@ private: }; template -class counting_semaphore { +class _LIBCPP_DEPRECATED_ATOMIC_SYNC counting_semaphore { __atomic_semaphore_base __semaphore_; public: @@ -172,7 +172,9 @@ public: } }; -using binary_semaphore = counting_semaphore<1>; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH +using binary_semaphore _LIBCPP_DEPRECATED_ATOMIC_SYNC = counting_semaphore<1>; +_LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp index 2b9f34b731f876..0ec530c922e707 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp @@ -7,9 +7,12 @@ //===----------------------------------------------------------------------===// // // UNSUPPORTED: no-threads -// XFAIL: c++03 +// UNSUPPORTED: c++03 // XFAIL: !has-1024-bit-atomics +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp index dfa781c5660090..c21b67d479ae24 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp @@ -7,9 +7,12 @@ //===----------------------------------------------------------------------===// // // UNSUPPORTED: no-threads -// XFAIL: c++03 +// UNSUPPORTED: c++03 // XFAIL: !has-1024-bit-atomics +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp index 38142b336e72ca..af99113f13499d 
100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp @@ -7,9 +7,12 @@ //===----------------------------------------------------------------------===// // // UNSUPPORTED: no-threads -// XFAIL: c++03 +// UNSUPPORTED: c++03 // XFAIL: !has-1024-bit-atomics +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp index 2db95a0b67a7f0..bb8c64593b54b5 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp @@ -7,9 +7,12 @@ //===----------------------------------------------------------------------===// // // UNSUPPORTED: no-threads -// XFAIL: c++03 +// UNSUPPORTED: c++03 // XFAIL: !has-1024-bit-atomics +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp index 18cdc6d654ac2b..d9d9c1dba6bbb8 100644 --- a/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp +++ b/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp index 3fc48261de1b12..aff7b26e16f70a 100644 --- a/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp +++ b/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp index 2aee8624ae3d52..8c45ba9278f289 100644 --- a/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp +++ b/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.barrier/completion.pass.cpp b/libcxx/test/std/thread/thread.barrier/completion.pass.cpp index 7354dbe6ffe8ae..633a0c8bf23664 100644 --- a/libcxx/test/std/thread/thread.barrier/completion.pass.cpp +++ 
b/libcxx/test/std/thread/thread.barrier/completion.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp b/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp index d47127a18613b7..fe7068d2a574ca 100644 --- a/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp +++ b/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // explicit barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()); diff --git a/libcxx/test/std/thread/thread.barrier/max.pass.cpp b/libcxx/test/std/thread/thread.barrier/max.pass.cpp index ec03c5c87a09c1..b09a02e1bdef4c 100644 --- a/libcxx/test/std/thread/thread.barrier/max.pass.cpp +++ b/libcxx/test/std/thread/thread.barrier/max.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // #include diff --git a/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp b/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp index ddc06d2038cc82..8ca4f37b73b950 100644 --- a/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp +++ b/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.latch/count_down.pass.cpp b/libcxx/test/std/thread/thread.latch/count_down.pass.cpp index 1503c09509a6c8..eb524abd24b98a 100644 --- a/libcxx/test/std/thread/thread.latch/count_down.pass.cpp +++ b/libcxx/test/std/thread/thread.latch/count_down.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.latch/ctor.pass.cpp b/libcxx/test/std/thread/thread.latch/ctor.pass.cpp index 1983f6409cb5a5..bca4561bd2f742 100644 --- a/libcxx/test/std/thread/thread.latch/ctor.pass.cpp +++ b/libcxx/test/std/thread/thread.latch/ctor.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // inline constexpr explicit latch(ptrdiff_t __expected); diff --git a/libcxx/test/std/thread/thread.latch/max.pass.cpp b/libcxx/test/std/thread/thread.latch/max.pass.cpp index 8b9176c8cac570..bcf353ed9712ee 100644 --- a/libcxx/test/std/thread/thread.latch/max.pass.cpp +++ b/libcxx/test/std/thread/thread.latch/max.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library 
in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // #include diff --git a/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp b/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp index 70ef2cdf712544..8f354463a8697d 100644 --- a/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp +++ b/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp b/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp index 3f6e3107e8bce0..22eed736c6b753 100644 --- a/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp b/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp index 111a650b5ea39c..c01c78506587cd 100644 --- a/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp b/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp index 28ccc0124d489e..dcc298ce11ce88 100644 --- a/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // constexpr explicit counting_semaphore(ptrdiff_t desired); diff --git a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp index ca7ad0c92e60e4..6f3ed5e345e0b5 100644 --- a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // #include diff --git a/libcxx/test/std/thread/thread.semaphore/release.pass.cpp b/libcxx/test/std/thread/thread.semaphore/release.pass.cpp index bf3dd7f7d814fe..3c4d179e504332 100644 --- a/libcxx/test/std/thread/thread.semaphore/release.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/release.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp 
b/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp index 9fa01fc0359044..77f15ece221d43 100644 --- a/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp b/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp index 0d0f7792592fbe..ec159daf87a3fb 100644 --- a/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // From bd28889732e14ac6baca686c3ec99a82fc9cd89d Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 16 Apr 2024 07:54:51 -0700 Subject: [PATCH 115/300] [RISCV] Add coverage for strength reduction of mul 2^N +/- 3/5/9 --- llvm/test/CodeGen/RISCV/rv64zba.ll | 60 ++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 0d1d4838c61133..a84b9e5e7962f6 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -567,6 +567,66 @@ define i64 @mul96(i64 %a) { ret i64 %c } +define i64 @mul119(i64 %a) { +; CHECK-LABEL: mul119: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 119 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 119 + ret i64 %c +} + +define i64 @mul123(i64 %a) { +; CHECK-LABEL: mul123: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 123 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 123 + ret i64 %c +} + +define i64 @mul125(i64 %a) { +; CHECK-LABEL: mul125: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 125 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 125 + ret i64 %c +} + +define i64 @mul131(i64 %a) { +; CHECK-LABEL: mul131: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 131 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 131 + ret i64 %c +} + +define i64 @mul133(i64 %a) { +; CHECK-LABEL: mul133: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 133 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 133 + ret i64 %c +} + +define i64 @mul137(i64 %a) { +; CHECK-LABEL: mul137: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 137 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 137 + ret i64 %c +} + define i64 @mul160(i64 %a) { ; RV64I-LABEL: mul160: ; RV64I: # %bb.0: From 1334c034a73b7bf8a7af08be1c33d24a58127c47 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 16 Apr 2024 10:01:34 -0500 Subject: [PATCH 116/300] [flang] Fix test after 4078afc6d23e25df6baedad61b224ef86a94d42f This test requires the OpenMP runtime to be present, but the way that the lit config detects it fails when "openmp" is added to RUNTIMES instead of PROJECTS. This caused the tests to be skipped as unsupported in local and upstream tests. The actual bug was a missing word in the message, and the check being placed on the wrong line.
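For reference, flang's semantics tests attach each expected diagnostic to the source line that produces it, so the corrected check must both name the OMP WORKSHARE directive and sit directly above the directive carrying the misplaced clause. A minimal sketch, distilled from the clause-validity01.f90 hunk below:

  !ERROR: NOWAIT clause is not allowed on the OMP WORKSHARE directive, use it on OMP END WORKSHARE directive
  !$omp workshare nowait
  !$omp end workshare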
--- flang/test/Semantics/OpenMP/clause-validity01.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90 index 74f154bb0ad67d..21b99cb82549ad 100644 --- a/flang/test/Semantics/OpenMP/clause-validity01.f90 +++ b/flang/test/Semantics/OpenMP/clause-validity01.f90 @@ -342,8 +342,8 @@ a = 1.0 !ERROR: COPYPRIVATE clause is not allowed on the END WORKSHARE directive !$omp end workshare nowait copyprivate(a) + !ERROR: NOWAIT clause is not allowed on the OMP WORKSHARE directive, use it on OMP END WORKSHARE directive !$omp workshare nowait - !ERROR: NOWAIT clause is not allowed on the WORKSHARE directive, use it on OMP END WORKSHARE directive !$omp end workshare !$omp end parallel From bf1ad1d267b1f911cb9846403d2c3d3250a40870 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 16 Apr 2024 16:01:57 +0100 Subject: [PATCH 117/300] [VectorCombine][X86] Add initial shuffle-of-shuffles.ll test coverage for #88743 --- .../VectorCombine/X86/shuffle-of-shuffles.ll | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll new file mode 100644 index 00000000000000..b5b5bb997c6c7a --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s + +; TODO: fold to identity + +define <8 x i32> @concat_extract_subvectors(<8 x i32> %x) { +; CHECK-LABEL: define <8 x i32> @concat_extract_subvectors( +; CHECK-SAME: <8 x i32> [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[LO:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[HI:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[CONCAT:%.*]] = shufflevector <4 x i32> [[LO]], <4 x i32> [[HI]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[CONCAT]] +; + %lo = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> + %hi = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> + %concat = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> + ret <8 x i32> %concat +} + +; negative test - shuffle contains undef + +define <8 x i32> @concat_extract_subvectors_undef(<8 x i32> %x) { +; CHECK-LABEL: define <8 x i32> @concat_extract_subvectors_undef( +; CHECK-SAME: <8 x i32> [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LO:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> undef, <4 x i32> +; CHECK-NEXT: [[HI:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> undef, <4 x i32> +; CHECK-NEXT: [[CONCAT:%.*]] = shufflevector <4 x i32> [[LO]], <4 x i32> [[HI]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[CONCAT]] +; + %lo = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> + %hi = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> + %concat = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> + ret <8 x i32> %concat +} + +; negative test - shuffle contains poison + +define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) { +; CHECK-LABEL: define <8 x i32> @concat_extract_subvectors_poison( +; CHECK-SAME: <8 x i32> [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LO:%.*]] = shufflevector
<8 x i32> [[X]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[HI:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[CONCAT:%.*]] = shufflevector <4 x i32> [[LO]], <4 x i32> [[HI]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[CONCAT]] +; + %lo = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> + %hi = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> + %concat = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> + ret <8 x i32> %concat +} From f8e2ec13a8c6d33cb7b4f37869b4429ddcf43f01 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Tue, 16 Apr 2024 08:29:09 -0700 Subject: [PATCH 118/300] Revert "Add asan tests for libsanitizers. (#88349)" This reverts commit 82f479ba315a417b6cd01a8c2efdc15c26689f2e due to bot breakage. --- lldb/test/API/functionalities/asan/Makefile | 6 +- .../functionalities/asan/TestMemoryHistory.py | 73 +------------------ .../functionalities/asan/TestReportData.py | 20 +---- .../API/functionalities/libsanitizers/util.py | 3 - 4 files changed, 5 insertions(+), 97 deletions(-) delete mode 100644 lldb/test/API/functionalities/libsanitizers/util.py diff --git a/lldb/test/API/functionalities/asan/Makefile b/lldb/test/API/functionalities/asan/Makefile index d66696fed7078f..4913a18d8cc6f9 100644 --- a/lldb/test/API/functionalities/asan/Makefile +++ b/lldb/test/API/functionalities/asan/Makefile @@ -1,8 +1,4 @@ C_SOURCES := main.c -asan: CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info -asan: all - -libsanitizers: CFLAGS_EXTRAS := -fsanitize=address -fsanitize-stable-abi -g -gcolumn-info -libsanitizers: all +CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info include Makefile.rules diff --git a/lldb/test/API/functionalities/asan/TestMemoryHistory.py b/lldb/test/API/functionalities/asan/TestMemoryHistory.py index ee7939203ead18..00162ae8822c74 100644 --- a/lldb/test/API/functionalities/asan/TestMemoryHistory.py +++ b/lldb/test/API/functionalities/asan/TestMemoryHistory.py @@ -9,21 +9,15 @@ from lldbsuite.test import lldbplatform from lldbsuite.test import lldbutil -from functionalities.libsanitizers.util import no_libsanitizers class AsanTestCase(TestBase): @skipIfFreeBSD # llvm.org/pr21136 runtimes not yet available by default @expectedFailureNetBSD @skipUnlessAddressSanitizer def test(self): - self.build(make_targets=["asan"]) + self.build() self.asan_tests() - @skipIf(oslist=no_match(["macosx"])) - def test_libsanitizers_asan(self): - self.build(make_targets=["libsanitizers"]) - self.libsanitizer_tests() - def setUp(self): # Call super's setUp(). 
TestBase.setUp(self) @@ -32,71 +26,6 @@ def setUp(self): self.line_free = line_number("main.c", "// free line") self.line_breakpoint = line_number("main.c", "// break line") - # Test line numbers: rdar://126237493 - def libsanitizer_tests(self): - target = self.createTestTarget() - - if no_libsanitizers(self): - self.skipTest("libsanitizers not found") - - self.runCmd( - "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0" - ) - - self.runCmd("run") - - # In libsanitizers, memory history is not supported until a report has been generated - self.expect( - "thread list", - "Process should be stopped due to ASan report", - substrs=["stopped", "stop reason = Use of deallocated memory"], - ) - - # test the 'memory history' command - self.expect( - "memory history 'pointer'", - substrs=[ - "Memory deallocated by Thread", - "a.out`f2", - "main.c", - "Memory allocated by Thread", - "a.out`f1", - "main.c", - ], - ) - - # do the same using SB API - process = self.dbg.GetSelectedTarget().process - val = ( - process.GetSelectedThread().GetSelectedFrame().EvaluateExpression("pointer") - ) - addr = val.GetValueAsUnsigned() - threads = process.GetHistoryThreads(addr) - self.assertEqual(threads.GetSize(), 2) - - history_thread = threads.GetThreadAtIndex(0) - self.assertTrue(history_thread.num_frames >= 2) - self.assertEqual( - history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), - "main.c", - ) - - history_thread = threads.GetThreadAtIndex(1) - self.assertTrue(history_thread.num_frames >= 2) - self.assertEqual( - history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), - "main.c", - ) - - # let's free the container (SBThreadCollection) and see if the - # SBThreads still live - threads = None - self.assertTrue(history_thread.num_frames >= 2) - self.assertEqual( - history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), - "main.c", - ) - def asan_tests(self): target = self.createTestTarget() diff --git a/lldb/test/API/functionalities/asan/TestReportData.py b/lldb/test/API/functionalities/asan/TestReportData.py index de0c1206a57ad6..543c5fe66a208d 100644 --- a/lldb/test/API/functionalities/asan/TestReportData.py +++ b/lldb/test/API/functionalities/asan/TestReportData.py @@ -9,7 +9,6 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil -from functionalities.libsanitizers.util import no_libsanitizers class AsanTestReportDataCase(TestBase): @skipIfFreeBSD # llvm.org/pr21136 runtimes not yet available by default @@ -17,14 +16,9 @@ class AsanTestReportDataCase(TestBase): @skipUnlessAddressSanitizer @skipIf(archs=["i386"], bugnumber="llvm.org/PR36710") def test(self): - self.build(make_targets=["asan"]) + self.build() self.asan_tests() - @skipIf(oslist=no_match(["macosx"])) - def test_libsanitizers_asan(self): - self.build(make_targets=["libsanitizers"]) - self.asan_tests(libsanitizers=True) - def setUp(self): # Call super's setUp(). 
TestBase.setUp(self) @@ -35,18 +29,10 @@ def setUp(self): self.line_crash = line_number("main.c", "// BOOM line") self.col_crash = 16 - def asan_tests(self, libsanitizers=False): + def asan_tests(self): target = self.createTestTarget() - if libsanitizers and no_libsanitizers(self): - self.skipTest("libsanitizers not found") - - if libsanitizers: - self.runCmd( - "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0" - ) - else: - self.registerSanitizerLibrariesWithTarget(target) + self.registerSanitizerLibrariesWithTarget(target) self.runCmd("run") diff --git a/lldb/test/API/functionalities/libsanitizers/util.py b/lldb/test/API/functionalities/libsanitizers/util.py deleted file mode 100644 index ad68541aba8d05..00000000000000 --- a/lldb/test/API/functionalities/libsanitizers/util.py +++ /dev/null @@ -1,3 +0,0 @@ -def no_libsanitizers(testbase): - testbase.runCmd("image list libsystem_sanitizers.dylib", check=False) - return not "libsystem_sanitizers.dylib" in testbase.res.GetOutput() From 8cee94e989b5bf6fb6455087d48eb6c6e0e23c54 Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Tue, 16 Apr 2024 16:32:57 +0100 Subject: [PATCH 119/300] [RISCV] Fix obvious copy paste error. CASE_VFMA_OPCODE_VV and CASE_VFMA_CHANGE_OPCODE_VV need to match up if we are to avoid "Unexpected opcode" errors, but in CASE_VFMA_CHANGE_OPCODE_VV, CASE_VFMA_CHANGE_OPCODE_LMULS_MF2 had mistakenly been used instead of CASE_VFMA_CHANGE_OPCODE_LMULS_MF4. --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 668062c8d33f6f..14b5cbea71722f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3016,7 +3016,7 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE, SEW) #define CASE_VFMA_CHANGE_OPCODE_VV(OLDOP, NEWOP) \ - CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VV, E16) \ + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VV, E16) \ CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VV, E32) \ CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VV, E64) From 51b42b762112f2e77d032efd16fa0d9d31cde494 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 16 Apr 2024 17:45:02 +0200 Subject: [PATCH 120/300] Revert "[Sema] Mark alias/ifunc targets used and consider mangled names" (#88919) Reverts llvm/llvm-project#87130 Bot is broken with clang crash: https://lab.llvm.org/buildbot/#/builders/272/builds/14063/steps/6/logs/stdio --- clang/lib/Sema/CMakeLists.txt | 1 - clang/lib/Sema/SemaDeclAttr.cpp | 44 +++++-------------- clang/test/AST/ast-dump-attr-json.cpp | 1 - clang/test/Sema/alias-unused-win.cpp | 2 +- clang/test/Sema/alias-unused.cpp | 16 +++---- .../llvm-project-overlay/clang/BUILD.bazel | 1 - 6 files changed, 19 insertions(+), 46 deletions(-) diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt index a96439df664228..ab3b813a9ccd97 100644 --- a/clang/lib/Sema/CMakeLists.txt +++ b/clang/lib/Sema/CMakeLists.txt @@ -1,6 +1,5 @@ set(LLVM_LINK_COMPONENTS Core - Demangle FrontendHLSL FrontendOpenMP MC diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index d26f130b5774ce..b7b1fbc625a150 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -45,7 +45,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/StringExtras.h" -#include
"llvm/Demangle/Demangle.h" #include "llvm/IR/Assumptions.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Support/Error.h" @@ -1984,36 +1983,6 @@ static void handleWeakRefAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(::new (S.Context) WeakRefAttr(S.Context, AL)); } -// Mark alias/ifunc target as used. Due to name mangling, we look up the -// demangled name ignoring parameters (not supported by microsoftDemangle -// https://github.com/llvm/llvm-project/issues/88825). This should handle the -// majority of use cases while leaving namespace scope names unmarked. -static void markUsedForAliasOrIfunc(Sema &S, Decl *D, const ParsedAttr &AL, - StringRef Str) { - std::unique_ptr Demangled; - if (S.getASTContext().getCXXABIKind() != TargetCXXABI::Microsoft) - Demangled.reset(llvm::itaniumDemangle(Str, /*ParseParams=*/false)); - std::unique_ptr MC(S.Context.createMangleContext()); - SmallString<256> Name; - - const DeclarationNameInfo Target( - &S.Context.Idents.get(Demangled ? Demangled.get() : Str), AL.getLoc()); - LookupResult LR(S, Target, Sema::LookupOrdinaryName); - if (S.LookupName(LR, S.TUScope)) { - for (NamedDecl *ND : LR) { - if (MC->shouldMangleDeclName(ND)) { - llvm::raw_svector_ostream Out(Name); - Name.clear(); - MC->mangleName(GlobalDecl(ND), Out); - } else { - Name = ND->getIdentifier()->getName(); - } - if (Name == Str) - ND->markUsed(S.Context); - } - } -} - static void handleIFuncAttr(Sema &S, Decl *D, const ParsedAttr &AL) { StringRef Str; if (!S.checkStringLiteralArgumentAttr(AL, 0, Str)) @@ -2026,7 +1995,6 @@ static void handleIFuncAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - markUsedForAliasOrIfunc(S, D, AL, Str); D->addAttr(::new (S.Context) IFuncAttr(S.Context, AL, Str)); } @@ -2061,7 +2029,17 @@ static void handleAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) { } } - markUsedForAliasOrIfunc(S, D, AL, Str); + // Mark target used to prevent unneeded-internal-declaration warnings. + if (!S.LangOpts.CPlusPlus) { + // FIXME: demangle Str for C++, as the attribute refers to the mangled + // linkage name, not the pre-mangled identifier. 
+ const DeclarationNameInfo target(&S.Context.Idents.get(Str), AL.getLoc()); + LookupResult LR(S, target, Sema::LookupOrdinaryName); + if (S.LookupQualifiedName(LR, S.getCurLexicalContext())) + for (NamedDecl *ND : LR) + ND->markUsed(S.Context); + } + D->addAttr(::new (S.Context) AliasAttr(S.Context, AL, Str)); } diff --git a/clang/test/AST/ast-dump-attr-json.cpp b/clang/test/AST/ast-dump-attr-json.cpp index 883e584bfedf07..051c2956abfdf7 100644 --- a/clang/test/AST/ast-dump-attr-json.cpp +++ b/clang/test/AST/ast-dump-attr-json.cpp @@ -46,7 +46,6 @@ __thread __attribute__ ((tls_model ("local-exec"))) int tls_model_var; // CHECK-NEXT: "tokLen": 11 // CHECK-NEXT: } // CHECK-NEXT: }, -// CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "global_decl", // CHECK-NEXT: "mangledName": "global_decl", // CHECK-NEXT: "type": { diff --git a/clang/test/Sema/alias-unused-win.cpp b/clang/test/Sema/alias-unused-win.cpp index 97d57a3bbd1e31..47c96d41175179 100644 --- a/clang/test/Sema/alias-unused-win.cpp +++ b/clang/test/Sema/alias-unused-win.cpp @@ -7,7 +7,7 @@ extern "C" { static int f(void) { return 42; } // cxx-warning{{unused function 'f'}} int g(void) __attribute__((alias("f"))); -static int foo [] = { 42, 0xDEAD }; +static int foo [] = { 42, 0xDEAD }; // cxx-warning{{variable 'foo' is not needed and will not be emitted}} extern typeof(foo) bar __attribute__((unused, alias("foo"))); static int __attribute__((overloadable)) f0(int x) { return x; } // expected-warning{{unused function 'f0'}} diff --git a/clang/test/Sema/alias-unused.cpp b/clang/test/Sema/alias-unused.cpp index c0b541c880e525..dc8e46f072d74d 100644 --- a/clang/test/Sema/alias-unused.cpp +++ b/clang/test/Sema/alias-unused.cpp @@ -14,26 +14,24 @@ extern typeof(foo) bar __attribute__((unused, alias("foo"))); /// We report a warning in C++ mode because the internal linkage `resolver` gets /// mangled as it does not have a language linkage. GCC does not mangle /// `resolver` or report a warning. -static int (*resolver(void))(void) { return f; } // cxx-warning{{unused function 'resolver'}} +static int (*resolver(void))(void) { return f; } // expected-warning{{unused function 'resolver'}} int ifunc(void) __attribute__((ifunc("resolver"))); -static int __attribute__((overloadable)) f0(int x) { return x; } +static int __attribute__((overloadable)) f0(int x) { return x; } // expected-warning{{unused function 'f0'}} static float __attribute__((overloadable)) f0(float x) { return x; } // expected-warning{{unused function 'f0'}} int g0(void) __attribute__((alias("_ZL2f0i"))); #ifdef __cplusplus -static int f1() { return 42; } +static int f1() { return 42; } // expected-warning{{unused function 'f1'}} int g1(void) __attribute__((alias("_ZL2f1v"))); } -/// We demangle alias/ifunc target and mark all found functions as used. 
- -static int f2(int) { return 42; } // cxx-warning{{unused function 'f2'}} -static int f2() { return 42; } +static int f2(int) { return 42; } // expected-warning{{unused function 'f2'}} +static int f2() { return 42; } // expected-warning{{unused function 'f2'}} int g2() __attribute__((alias("_ZL2f2v"))); -static int (*resolver1())() { return f; } // cxx-warning{{unused function 'resolver1'}} -static int (*resolver1(int))() { return f; } +static int (*resolver1())() { return f; } // expected-warning{{unused function 'resolver1'}} +static int (*resolver1(int))() { return f; } // expected-warning{{unused function 'resolver1'}} int ifunc1() __attribute__((ifunc("_ZL9resolver1i"))); /// TODO: We should report "unused function" for f3(int). diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 725ac6bb38120b..c2f77e3abca0e6 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1136,7 +1136,6 @@ cc_library( "//llvm:AllTargetsAsmParsers", "//llvm:AllTargetsCodeGens", "//llvm:Core", - "//llvm:Demangle", "//llvm:FrontendHLSL", "//llvm:FrontendOpenMP", "//llvm:MC", From 9d111286322ec99b32467eef3aeec6b588c49f18 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 16 Apr 2024 16:48:59 +0100 Subject: [PATCH 121/300] [libclc] Improve clarity of CMake foreach. NFC. Should be a bit easier to read. --- libclc/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index ed2764847e709e..f605c3bbbe9dce 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -50,7 +50,7 @@ if( LIBCLC_STANDALONE_BUILD OR CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DI endif() # Import required tools as targets - foreach( tool clang llvm-as llvm-link opt ) + foreach( tool IN ITEMS clang llvm-as llvm-link opt ) find_program( LLVM_TOOL_${tool} ${tool} PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH ) add_executable( libclc::${tool} IMPORTED GLOBAL ) set_target_properties( libclc::${tool} PROPERTIES IMPORTED_LOCATION ${LLVM_TOOL_${tool}} ) @@ -68,7 +68,7 @@ else() message(FATAL_ERROR "Clang is not enabled, but is required to build libclc in-tree") endif() - foreach( tool clang llvm-as llvm-link opt ) + foreach( tool IN ITEMS clang llvm-as llvm-link opt ) add_executable(libclc::${tool} ALIAS ${tool}) endforeach() endif() @@ -181,7 +181,7 @@ install( FILES ${CMAKE_CURRENT_BINARY_DIR}/libclc.pc DESTINATION "${CMAKE_INSTAL install( DIRECTORY generic/include/clc DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" ) if( ENABLE_RUNTIME_SUBNORMAL ) - foreach( file subnormal_use_default subnormal_disable ) + foreach( file IN ITEMS subnormal_use_default subnormal_disable ) link_bc( TARGET ${file} INPUTS ${PROJECT_SOURCE_DIR}/generic/lib/${file}.ll From 2704ebaf2885a16155ab7144f8dd0dd459d77089 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Tue, 16 Apr 2024 08:53:12 -0700 Subject: [PATCH 122/300] [flang][runtime] Create CUDA PTX OBJECT library target for F18 runtime CUDA build. (#88821) This is to experiment with distributing the F18 runtime CUDA library in the form of a pure PTX library. The change is under FLANG_EXPERIMENTAL_CUDA_RUNTIME CMake control.
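The underlying mechanism is CMake's CUDA_PTX_COMPILATION target property, which makes an OBJECT library emit PTX instead of device objects. A minimal sketch (the target name here is illustrative; the real macro derives it from its name argument):

  # Collect the CUDA-enabled runtime sources into an OBJECT library.
  llvm_add_library(FortranRuntimePTX OBJECT PARTIAL_SOURCES_INTENDED ${supported_files})
  # Compile each source in the target to PTX rather than a device object.
  set_property(TARGET obj.FortranRuntimePTX PROPERTY CUDA_PTX_COMPILATION ON)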
--- flang/cmake/modules/AddFlangOffloadRuntime.cmake | 6 +++++- flang/lib/Decimal/CMakeLists.txt | 2 +- flang/runtime/CMakeLists.txt | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index 6fb6213e90fc49..e34d3851187acf 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -10,7 +10,7 @@ set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING set(FLANG_OMP_DEVICE_ARCHITECTURES "all" CACHE STRING "List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 'gfx1103;sm_90')") -macro(enable_cuda_compilation files) +macro(enable_cuda_compilation name files) if (FLANG_EXPERIMENTAL_CUDA_RUNTIME) if (BUILD_SHARED_LIBS) message(FATAL_ERROR @@ -52,6 +52,10 @@ macro(enable_cuda_compilation files) include_directories(AFTER ${FLANG_LIBCUDACXX_PATH}/include) add_compile_definitions(RT_USE_LIBCUDACXX=1) endif() + + # Add an OBJECT library consisting of CUDA PTX. + llvm_add_library(${name}PTX OBJECT PARTIAL_SOURCES_INTENDED ${files}) + set_property(TARGET obj.${name}PTX PROPERTY CUDA_PTX_COMPILATION ON) endif() endmacro() diff --git a/flang/lib/Decimal/CMakeLists.txt b/flang/lib/Decimal/CMakeLists.txt index 3d562b8e3ce1e5..880b190f1c5815 100644 --- a/flang/lib/Decimal/CMakeLists.txt +++ b/flang/lib/Decimal/CMakeLists.txt @@ -55,7 +55,7 @@ set(sources ) include(AddFlangOffloadRuntime) -enable_cuda_compilation("${sources}") +enable_cuda_compilation(FortranDecimal "${sources}") enable_omp_offload_compilation("${sources}") add_flang_library(FortranDecimal INSTALL_WITH_TOOLCHAIN ${sources}) diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt index 2a65a22ab674c4..bdd0e07bbfd4d1 100644 --- a/flang/runtime/CMakeLists.txt +++ b/flang/runtime/CMakeLists.txt @@ -224,7 +224,7 @@ set(supported_files utf.cpp ) -enable_cuda_compilation("${supported_files}") +enable_cuda_compilation(FortranRuntime "${supported_files}") enable_omp_offload_compilation("${supported_files}") if (NOT TARGET FortranFloat128Math) From a79783d7ad00c4c10c30f637f4bf13551e47f3dd Mon Sep 17 00:00:00 2001 From: Robin Caloudis Date: Tue, 16 Apr 2024 17:58:11 +0200 Subject: [PATCH 123/300] [libc][fenv] Use proxy header (#88787) Include types `fexcept_t` and `fenv_t` from the corresponding proxy headers, as they are available since https://github.com/llvm/llvm-project/pull/88467.
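For context, a proxy header forwards to libc's own type definition in a full build and to the underlying system header in overlay mode. A simplified sketch of what hdr/types/fexcept_t.h provides (the exact contents may differ):

  // Simplified sketch of a proxy header; see libc/hdr/ for the real files.
  #ifdef LIBC_FULL_BUILD
  #include "include/llvm-libc-types/fexcept_t.h" // full build: libc's own type
  #else
  #include <fenv.h> // overlay build: use the system's fexcept_t
  #endif // LIBC_FULL_BUILD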
--- libc/src/fenv/fegetexceptflag.h | 2 +- libc/src/fenv/fesetexceptflag.h | 2 +- libc/src/fenv/feupdateenv.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/src/fenv/fegetexceptflag.h b/libc/src/fenv/fegetexceptflag.h index ad72161e536f83..fcb9598658d43b 100644 --- a/libc/src/fenv/fegetexceptflag.h +++ b/libc/src/fenv/fegetexceptflag.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_FENV_FEGETEXCEPTFLAG_H #define LLVM_LIBC_SRC_FENV_FEGETEXCEPTFLAG_H -#include +#include "hdr/types/fexcept_t.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/fenv/fesetexceptflag.h b/libc/src/fenv/fesetexceptflag.h index 15e62eda1b840c..a018358dc9dfcc 100644 --- a/libc/src/fenv/fesetexceptflag.h +++ b/libc/src/fenv/fesetexceptflag.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_FENV_FESETEXCEPTFLAG_H #define LLVM_LIBC_SRC_FENV_FESETEXCEPTFLAG_H -#include +#include "hdr/types/fexcept_t.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/fenv/feupdateenv.cpp b/libc/src/fenv/feupdateenv.cpp index 7e81b9476da91b..06648635381555 100644 --- a/libc/src/fenv/feupdateenv.cpp +++ b/libc/src/fenv/feupdateenv.cpp @@ -10,7 +10,7 @@ #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/common.h" -#include +#include "hdr/types/fenv_t.h" namespace LIBC_NAMESPACE { From 38895e6578f7728cfb3d41d0880a0e9b358d9afd Mon Sep 17 00:00:00 2001 From: Robin Caloudis Date: Tue, 16 Apr 2024 17:58:56 +0200 Subject: [PATCH 124/300] [libc][fenv] Remove unnecessary dependencies (#88788) Remove the fenv macro dependency from the CMake files as the underlying targets do not make use of it. Note that we do not have to worry about [corresponding Bazel targets](https://github.com/llvm/llvm-project/blob/main/utils/bazel/llvm-project-overlay/libc/BUILD.bazel#L1138-L1288), as they look good. 
--- libc/src/fenv/CMakeLists.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/libc/src/fenv/CMakeLists.txt b/libc/src/fenv/CMakeLists.txt index a28a7ca4c2d821..17e99474120627 100644 --- a/libc/src/fenv/CMakeLists.txt +++ b/libc/src/fenv/CMakeLists.txt @@ -17,7 +17,6 @@ add_entrypoint_object( HDRS fesetround.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -30,7 +29,6 @@ add_entrypoint_object( HDRS feclearexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -43,7 +41,6 @@ add_entrypoint_object( HDRS feraiseexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -56,7 +53,6 @@ add_entrypoint_object( HDRS fetestexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -69,7 +65,6 @@ add_entrypoint_object( HDRS fegetenv.h DEPENDS - libc.hdr.fenv_macros libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS @@ -83,7 +78,6 @@ add_entrypoint_object( HDRS fesetenv.h DEPENDS - libc.hdr.fenv_macros libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS @@ -111,7 +105,6 @@ add_entrypoint_object( HDRS fesetexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -166,7 +159,6 @@ add_entrypoint_object( HDRS feenableexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -179,7 +171,6 @@ add_entrypoint_object( HDRS fedisableexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -192,7 +183,6 @@ add_entrypoint_object( HDRS fegetexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 From 1c6b0f779f66494cb597884c1a52e377bde4bc54 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 16 Apr 2024 17:07:46 +0100 Subject: [PATCH 125/300] [RemoveDI] Add support for debug records to debugify (#87383) This patch changes debugify to support debug variable records, and subsequently to no longer convert modules automatically to intrinsics when entering debugify. --- llvm/lib/CodeGen/MachineDebugify.cpp | 18 ++- llvm/lib/Transforms/Utils/Debugify.cpp | 111 +++++++----------- .../check-line-and-variables-x.mir | 1 + .../MIRDebugify/check-line-and-variables.ll | 1 + .../MIRDebugify/check-line-and-variables.mir | 2 + .../MIRDebugify/locations-and-values.mir | 4 + .../MIRDebugify/multifunction-module.mir | 3 +- .../DebugInfo/debugify-bogus-dbg-value.ll | 1 + llvm/test/DebugInfo/debugify-each.ll | 34 ++++++ llvm/test/DebugInfo/debugify-export.ll | 3 + llvm/test/DebugInfo/debugify-ignore-phi.ll | 1 + .../debugify-original-no-dbg-info.ll | 1 + .../debugify-report-missing-locs-only.ll | 1 + llvm/test/DebugInfo/debugify.ll | 27 +++++ llvm/test/DebugInfo/pr37964.ll | 1 + .../test/DebugInfo/salvage-cast-debug-info.ll | 2 +- llvm/test/DebugInfo/verify-di-preserve.ll | 4 +- 17 files changed, 142 insertions(+), 73 deletions(-) diff --git a/llvm/lib/CodeGen/MachineDebugify.cpp b/llvm/lib/CodeGen/MachineDebugify.cpp index c264e199cf4722..bffdd51bfbca76 100644 --- a/llvm/lib/CodeGen/MachineDebugify.cpp +++ b/llvm/lib/CodeGen/MachineDebugify.cpp @@ -65,6 +65,7 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, // all the others. 
Function *DbgValF = M.getFunction("llvm.dbg.value"); DbgValueInst *EarliestDVI = nullptr; + DbgVariableRecord *EarliestDVR = nullptr; DenseMap Line2Var; DIExpression *Expr = nullptr; if (DbgValF) { @@ -80,6 +81,20 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, Expr = DVI->getExpression(); } } + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) { + if (!DVR.isDbgValue()) + continue; + unsigned Line = DVR.getDebugLoc().getLine(); + assert(Line != 0 && "debugify should not insert line 0 locations"); + Line2Var[Line] = DVR.getVariable(); + if (!EarliestDVR || Line < EarliestDVR->getDebugLoc().getLine()) + EarliestDVR = &DVR; + Expr = DVR.getExpression(); + } + } + } if (Line2Var.empty()) return true; @@ -109,7 +124,8 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, // Find a suitable local variable for the DBG_VALUE. unsigned Line = MI.getDebugLoc().getLine(); if (!Line2Var.count(Line)) - Line = EarliestDVI->getDebugLoc().getLine(); + Line = EarliestDVI ? EarliestDVI->getDebugLoc().getLine() + : EarliestDVR->getDebugLoc().getLine(); DILocalVariable *LocalVar = Line2Var[Line]; assert(LocalVar && "No variable for current line?"); VarSet.insert(LocalVar); diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 200bad22148f06..fcc82eadac36cf 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -87,10 +87,6 @@ bool llvm::applyDebugifyMetadata( return false; } - bool NewDebugMode = M.IsNewDbgInfoFormat; - if (NewDebugMode) - M.convertFromNewDbgValues(); - DIBuilder DIB(M); LLVMContext &Ctx = M.getContext(); auto *Int32Ty = Type::getInt32Ty(Ctx); @@ -214,9 +210,6 @@ bool llvm::applyDebugifyMetadata( if (!M.getModuleFlag(DIVersionKey)) M.addModuleFlag(Module::Warning, DIVersionKey, DEBUG_METADATA_VERSION); - if (NewDebugMode) - M.convertToNewDbgValues(); - return true; } @@ -311,10 +304,6 @@ bool llvm::collectDebugInfoMetadata(Module &M, return false; } - bool NewDebugMode = M.IsNewDbgInfoFormat; - if (NewDebugMode) - M.convertFromNewDbgValues(); - uint64_t FunctionsCnt = DebugInfoBeforePass.DIFunctions.size(); // Visit each instruction. for (Function &F : Functions) { @@ -349,20 +338,23 @@ bool llvm::collectDebugInfoMetadata(Module &M, // Cllect dbg.values and dbg.declare. if (DebugifyLevel > Level::Locations) { - if (auto *DVI = dyn_cast(&I)) { + auto HandleDbgVariable = [&](auto *DbgVar) { if (!SP) - continue; + return; // Skip inlined variables. - if (I.getDebugLoc().getInlinedAt()) - continue; + if (DbgVar->getDebugLoc().getInlinedAt()) + return; // Skip undef values. - if (DVI->isKillLocation()) - continue; + if (DbgVar->isKillLocation()) + return; - auto *Var = DVI->getVariable(); + auto *Var = DbgVar->getVariable(); DebugInfoBeforePass.DIVariables[Var]++; - continue; - } + }; + for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) + HandleDbgVariable(&DVR); + if (auto *DVI = dyn_cast(&I)) + HandleDbgVariable(DVI); } // Skip debug instructions other than dbg.value and dbg.declare. @@ -379,9 +371,6 @@ bool llvm::collectDebugInfoMetadata(Module &M, } } - if (NewDebugMode) - M.convertToNewDbgValues(); - return true; } @@ -561,10 +550,6 @@ bool llvm::checkDebugInfoMetadata(Module &M, return false; } - bool NewDebugMode = M.IsNewDbgInfoFormat; - if (NewDebugMode) - M.convertFromNewDbgValues(); - // Map the debug info holding DIs after a pass. 
DebugInfoPerPass DebugInfoAfterPass; @@ -599,20 +584,23 @@ bool llvm::checkDebugInfoMetadata(Module &M, // Collect dbg.values and dbg.declares. if (DebugifyLevel > Level::Locations) { - if (auto *DVI = dyn_cast(&I)) { + auto HandleDbgVariable = [&](auto *DbgVar) { if (!SP) - continue; + return; // Skip inlined variables. - if (I.getDebugLoc().getInlinedAt()) - continue; + if (DbgVar->getDebugLoc().getInlinedAt()) + return; // Skip undef values. - if (DVI->isKillLocation()) - continue; + if (DbgVar->isKillLocation()) + return; - auto *Var = DVI->getVariable(); + auto *Var = DbgVar->getVariable(); DebugInfoAfterPass.DIVariables[Var]++; - continue; - } + }; + for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) + HandleDbgVariable(&DVR); + if (auto *DVI = dyn_cast(&I)) + HandleDbgVariable(DVI); } // Skip debug instructions other than dbg.value and dbg.declare. @@ -675,16 +663,14 @@ bool llvm::checkDebugInfoMetadata(Module &M, // the debugging information from the previous pass. DebugInfoBeforePass = DebugInfoAfterPass; - if (NewDebugMode) - M.convertToNewDbgValues(); - LLVM_DEBUG(dbgs() << "\n\n"); return Result; } namespace { -/// Return true if a mis-sized diagnostic is issued for \p DVI. -bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) { +/// Return true if a mis-sized diagnostic is issued for \p DbgVal. +template +bool diagnoseMisSizedDbgValue(Module &M, DbgValTy *DbgVal) { // The size of a dbg.value's value operand should match the size of the // variable it corresponds to. // @@ -693,22 +679,22 @@ bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) { // For now, don't try to interpret anything more complicated than an empty // DIExpression. Eventually we should try to handle OP_deref and fragments. - if (DVI->getExpression()->getNumElements()) + if (DbgVal->getExpression()->getNumElements()) return false; - Value *V = DVI->getVariableLocationOp(0); + Value *V = DbgVal->getVariableLocationOp(0); if (!V) return false; Type *Ty = V->getType(); uint64_t ValueOperandSize = getAllocSizeInBits(M, Ty); - std::optional DbgVarSize = DVI->getFragmentSizeInBits(); + std::optional DbgVarSize = DbgVal->getFragmentSizeInBits(); if (!ValueOperandSize || !DbgVarSize) return false; bool HasBadSize = false; if (Ty->isIntegerTy()) { - auto Signedness = DVI->getVariable()->getSignedness(); + auto Signedness = DbgVal->getVariable()->getSignedness(); if (Signedness && *Signedness == DIBasicType::Signedness::Signed) HasBadSize = ValueOperandSize < *DbgVarSize; } else { @@ -718,7 +704,7 @@ bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) { if (HasBadSize) { dbg() << "ERROR: dbg.value operand has size " << ValueOperandSize << ", but its variable has size " << *DbgVarSize << ": "; - DVI->print(dbg()); + DbgVal->print(dbg()); dbg() << "\n"; } return HasBadSize; @@ -735,10 +721,6 @@ bool checkDebugifyMetadata(Module &M, return false; } - bool NewDebugMode = M.IsNewDbgInfoFormat; - if (NewDebugMode) - M.convertFromNewDbgValues(); - auto getDebugifyOperand = [&](unsigned Idx) -> unsigned { return mdconst::extract(NMD->getOperand(Idx)->getOperand(0)) ->getZExtValue(); @@ -780,18 +762,23 @@ bool checkDebugifyMetadata(Module &M, } // Find missing variables and mis-sized debug values. 
- for (Instruction &I : instructions(F)) { - auto *DVI = dyn_cast(&I); - if (!DVI) - continue; - + auto CheckForMisSized = [&](auto *DbgVal) { unsigned Var = ~0U; - (void)to_integer(DVI->getVariable()->getName(), Var, 10); + (void)to_integer(DbgVal->getVariable()->getName(), Var, 10); assert(Var <= OriginalNumVars && "Unexpected name for DILocalVariable"); - bool HasBadSize = diagnoseMisSizedDbgValue(M, DVI); + bool HasBadSize = diagnoseMisSizedDbgValue(M, DbgVal); if (!HasBadSize) MissingVars.reset(Var - 1); HasErrors |= HasBadSize; + }; + for (Instruction &I : instructions(F)) { + for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) + if (DVR.isDbgValue() || DVR.isDbgAssign()) + CheckForMisSized(&DVR); + auto *DVI = dyn_cast(&I); + if (!DVI) + continue; + CheckForMisSized(DVI); } } @@ -820,9 +807,6 @@ bool checkDebugifyMetadata(Module &M, if (Strip) Ret = stripDebugifyMetadata(M); - if (NewDebugMode) - M.convertToNewDbgValues(); - return Ret; } @@ -1052,10 +1036,6 @@ FunctionPass *createCheckDebugifyFunctionPass( PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M, ModuleAnalysisManager &) { - bool NewDebugMode = M.IsNewDbgInfoFormat; - if (NewDebugMode) - M.convertFromNewDbgValues(); - if (Mode == DebugifyMode::SyntheticDebugInfo) checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass, "CheckModuleDebugify", Strip, StatsMap); @@ -1065,9 +1045,6 @@ PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M, "CheckModuleDebugify (original debuginfo)", NameOfWrappedPass, OrigDIVerifyBugsReportFilePath); - if (NewDebugMode) - M.convertToNewDbgValues(); - return PreservedAnalyses::all(); } diff --git a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables-x.mir b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables-x.mir index eaa627966347fb..40ea01189f2cd9 100644 --- a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables-x.mir +++ b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables-x.mir @@ -1,5 +1,6 @@ # REQUIRES: x86-registered-target # RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=mir-check-debugify -o - %s 2>&1 | FileCheck %s +# RUN: llc --experimental-debuginfo-iterators=false -mtriple=x86_64-unknown-linux-gnu -run-pass=mir-check-debugify -o - %s 2>&1 | FileCheck %s --- | ; ModuleID = 'check-line-and-variables.mir' source_filename = "check-line-and-variables.c" diff --git a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.ll b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.ll index 9033fd2f147c47..56c7cf45705a78 100644 --- a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.ll +++ b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.ll @@ -1,4 +1,5 @@ ; RUN: llc -debugify-check-and-strip-all-safe -o - %s 2>&1 | FileCheck %s +; RUN: llc --experimental-debuginfo-iterators=false -debugify-check-and-strip-all-safe -o - %s 2>&1 | FileCheck %s ; ModuleID = 'main.c' source_filename = "main.c" diff --git a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.mir b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.mir index 9eb722258b7031..0805a7f4cfc6ce 100644 --- a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.mir +++ b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.mir @@ -1,6 +1,8 @@ # REQUIRES: x86-registered-target # RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=mir-debugify,dead-mi-elimination,mir-check-debugify -o - %s 2>&1 | FileCheck %s # RUN: llc -mtriple=x86_64-unknown-linux-gnu 
-run-pass=mir-debugify,mir-check-debugify -o - %s 2>&1 | FileCheck %s --check-prefix=CHECK-PASS +# RUN: llc --experimental-debuginfo-iterators=false -mtriple=x86_64-unknown-linux-gnu -run-pass=mir-debugify,dead-mi-elimination,mir-check-debugify -o - %s 2>&1 | FileCheck %s +# RUN: llc --experimental-debuginfo-iterators=false -mtriple=x86_64-unknown-linux-gnu -run-pass=mir-debugify,mir-check-debugify -o - %s 2>&1 | FileCheck %s --check-prefix=CHECK-PASS --- | ; ModuleID = 'check-line-and-variables.mir' source_filename = "check-line-and-variables.ll" diff --git a/llvm/test/CodeGen/Generic/MIRDebugify/locations-and-values.mir b/llvm/test/CodeGen/Generic/MIRDebugify/locations-and-values.mir index 59dcff9efd4d54..3035fb8eab3f82 100644 --- a/llvm/test/CodeGen/Generic/MIRDebugify/locations-and-values.mir +++ b/llvm/test/CodeGen/Generic/MIRDebugify/locations-and-values.mir @@ -2,6 +2,10 @@ # RUN: llc -run-pass=mir-debugify -debugify-level=locations -o - %s | FileCheck --check-prefixes=ALL --implicit-check-not=dbg.value %s # RUN: llc -run-pass=mir-debugify,mir-strip-debug,mir-debugify -o - %s | FileCheck --check-prefixes=ALL,VALUE %s # RUN: llc -run-pass=mir-debugify,mir-strip-debug -o - %s | FileCheck --check-prefix=STRIP %s +# RUN: llc --experimental-debuginfo-iterators=false -run-pass=mir-debugify -o - %s | FileCheck --check-prefixes=ALL,VALUE %s +# RUN: llc --experimental-debuginfo-iterators=false -run-pass=mir-debugify -debugify-level=locations -o - %s | FileCheck --check-prefixes=ALL --implicit-check-not=dbg.value %s +# RUN: llc --experimental-debuginfo-iterators=false -run-pass=mir-debugify,mir-strip-debug,mir-debugify -o - %s | FileCheck --check-prefixes=ALL,VALUE %s +# RUN: llc --experimental-debuginfo-iterators=false -run-pass=mir-debugify,mir-strip-debug -o - %s | FileCheck --check-prefix=STRIP %s --- | ; ModuleID = 'loc-only.ll' diff --git a/llvm/test/CodeGen/Generic/MIRDebugify/multifunction-module.mir b/llvm/test/CodeGen/Generic/MIRDebugify/multifunction-module.mir index fe4fcc1a15bb82..8079db926e1b05 100644 --- a/llvm/test/CodeGen/Generic/MIRDebugify/multifunction-module.mir +++ b/llvm/test/CodeGen/Generic/MIRDebugify/multifunction-module.mir @@ -1,6 +1,5 @@ -# FIXME: Remove rm after a few weeks. 
-# RUN: rm -f %S/multifunction-module.s # RUN: llc -run-pass=mir-debugify,mir-check-debugify -o - %s 2>&1 | FileCheck %s +# RUN: llc --experimental-debuginfo-iterators=false -run-pass=mir-debugify,mir-check-debugify -o - %s 2>&1 | FileCheck %s # CHECK: Machine IR debug info check: PASS # CHECK-NOT: Assertion `Var <= NumVars && "Unexpected name for DILocalVariable"' diff --git a/llvm/test/DebugInfo/debugify-bogus-dbg-value.ll b/llvm/test/DebugInfo/debugify-bogus-dbg-value.ll index 4990979f10c53c..55e436b1a93b22 100644 --- a/llvm/test/DebugInfo/debugify-bogus-dbg-value.ll +++ b/llvm/test/DebugInfo/debugify-bogus-dbg-value.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=check-debugify < %s 2>&1 | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false -passes=check-debugify < %s 2>&1 | FileCheck %s define <2 x i64> @test-fun(<2 x i64> %A) !dbg !6 { %and = and <2 x i64> %A, , !dbg !14 diff --git a/llvm/test/DebugInfo/debugify-each.ll b/llvm/test/DebugInfo/debugify-each.ll index e9241dedb69600..7685b57b5dd15f 100644 --- a/llvm/test/DebugInfo/debugify-each.ll +++ b/llvm/test/DebugInfo/debugify-each.ll @@ -40,6 +40,40 @@ ; RUN: opt -debugify-each -passes=globalopt -S -o /dev/null < %s 2> %t ; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS-ONE +; Repeat the same checks with debug intrinsics enabled. +; RUN: opt --experimental-debuginfo-iterators=false -debugify-each -O3 -S -o /dev/null < %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS +; RUN: FileCheck %s -input-file=%t -check-prefix=FUNCTION-PASS +; RUN: opt --experimental-debuginfo-iterators=false -disable-output -debugify-each -passes='default' %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS +; RUN: FileCheck %s -input-file=%t -check-prefix=FUNCTION-PASS + +; RUN: opt --experimental-debuginfo-iterators=false -enable-debugify -debugify-each -O3 -S -o /dev/null < %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS +; RUN: FileCheck %s -input-file=%t -check-prefix=FUNCTION-PASS + +; RUN: opt --experimental-debuginfo-iterators=false -debugify-each -passes='instrprof,instrprof,sroa,sccp' -S -o /dev/null < %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS +; RUN: FileCheck %s -input-file=%t -check-prefix=FUNCTION-PASS + +; RUN: opt --experimental-debuginfo-iterators=false -debugify-each -O1 < %s | opt -O2 -o /dev/null + +; RUN: opt --experimental-debuginfo-iterators=false -disable-output -debugify-quiet -debugify-each -O1 < %s 2>&1 | count 0 + +; RUN: opt --experimental-debuginfo-iterators=false -O1 < %s -S -o %t.before +; RUN: opt --experimental-debuginfo-iterators=false -O1 -debugify-each < %s -S -o %t.after +; RUN: diff %t.before %t.after + +; RUN: opt --experimental-debuginfo-iterators=false -O1 < %s | llvm-dis -o %t.before +; RUN: opt --experimental-debuginfo-iterators=false -O1 -debugify-each < %s | llvm-dis -o %t.after +; RUN: diff %t.before %t.after + +; RUN: opt --experimental-debuginfo-iterators=false -debugify-each -passes=instsimplify -S -o /dev/null < %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=FUNCTION-PASS-ONE + +; RUN: opt --experimental-debuginfo-iterators=false -debugify-each -passes=globalopt -S -o /dev/null < %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS-ONE + define void @foo(i32 %arg) { call i32 asm "bswap $0", "=r,r"(i32 %arg) ret void diff --git a/llvm/test/DebugInfo/debugify-export.ll b/llvm/test/DebugInfo/debugify-export.ll index 6e5952d433da9b..30333ca908b0d2 100644 --- 
a/llvm/test/DebugInfo/debugify-export.ll +++ b/llvm/test/DebugInfo/debugify-export.ll @@ -1,6 +1,9 @@ ; RUN: opt %s -disable-output -debugify-each -debugify-quiet -debugify-export - -passes=globalopt | FileCheck %s ; RUN: opt %s -disable-output -debugify-each -debugify-quiet -debugify-export - -passes=globalopt | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false %s -disable-output -debugify-each -debugify-quiet -debugify-export - -passes=globalopt | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false %s -disable-output -debugify-each -debugify-quiet -debugify-export - -passes=globalopt | FileCheck %s + ; CHECK: Pass Name ; CHECK-SAME: # of missing debug values ; CHECK-SAME: # of missing locations diff --git a/llvm/test/DebugInfo/debugify-ignore-phi.ll b/llvm/test/DebugInfo/debugify-ignore-phi.ll index 322ccafa22ac81..643df1d9604852 100644 --- a/llvm/test/DebugInfo/debugify-ignore-phi.ll +++ b/llvm/test/DebugInfo/debugify-ignore-phi.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=check-debugify < %s -S 2>&1 | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false -passes=check-debugify < %s -S 2>&1 | FileCheck %s define void @test_phi(i1 %cond) !dbg !6 { br i1 %cond, label %1, label %2, !dbg !11 diff --git a/llvm/test/DebugInfo/debugify-original-no-dbg-info.ll b/llvm/test/DebugInfo/debugify-original-no-dbg-info.ll index 941b294fb85567..4cbbfc5c215e28 100644 --- a/llvm/test/DebugInfo/debugify-original-no-dbg-info.ll +++ b/llvm/test/DebugInfo/debugify-original-no-dbg-info.ll @@ -1,4 +1,5 @@ ; RUN: opt -verify-debuginfo-preserve -passes=instcombine -S -o - < %s 2>&1 | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false -verify-debuginfo-preserve -passes=instcombine -S -o - < %s 2>&1 | FileCheck %s ; CHECK: ModuleDebugify (original debuginfo): Skipping module without debug info ; CHECK-NEXT: CheckModuleDebugify (original debuginfo): Skipping module without debug info diff --git a/llvm/test/DebugInfo/debugify-report-missing-locs-only.ll b/llvm/test/DebugInfo/debugify-report-missing-locs-only.ll index 1c5daa19c64841..04b7636f025a07 100644 --- a/llvm/test/DebugInfo/debugify-report-missing-locs-only.ll +++ b/llvm/test/DebugInfo/debugify-report-missing-locs-only.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=check-debugify < %s -S -o - 2>&1 | FileCheck %s -implicit-check-not "WARNING: Instruction with empty DebugLoc in function bar" +; RUN: opt --experimental-debuginfo-iterators=false -passes=check-debugify < %s -S -o - 2>&1 | FileCheck %s -implicit-check-not "WARNING: Instruction with empty DebugLoc in function bar" ; CHECK: WARNING: Instruction with empty DebugLoc in function foo -- ret void define void @foo() !dbg !6 { diff --git a/llvm/test/DebugInfo/debugify.ll b/llvm/test/DebugInfo/debugify.ll index 5ce6795d41b6bf..191015f8259339 100644 --- a/llvm/test/DebugInfo/debugify.ll +++ b/llvm/test/DebugInfo/debugify.ll @@ -25,6 +25,33 @@ ; RUN: opt -enable-debugify -O1 < %s | opt -O2 -o /dev/null ; RUN: opt -passes=debugify,mem2reg,check-debugify < %s | opt -O2 -o /dev/null +;; Perform the same checks again for intrinsic debug info +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify -S -o - < %s | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify -S -o - < %s | FileCheck %s + +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,debugify -S -o - < %s 2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-REPEAT +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,debugify -S -o - < %s 
2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-REPEAT + +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,check-debugify -S -o - < %s | \ +; RUN: FileCheck %s -implicit-check-not="CheckModuleDebugify: FAIL" +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,check-debugify -S -o - < %s | \ +; RUN: FileCheck %s -implicit-check-not="CheckModuleDebugify: FAIL" +; RUN: opt --experimental-debuginfo-iterators=false -enable-debugify -passes=verify -S -o - < %s | \ +; RUN: FileCheck %s -implicit-check-not="CheckModuleDebugify: FAIL" + +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,strip,check-debugify -S -o - < %s 2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-WARN + +; RUN: opt --experimental-debuginfo-iterators=false -enable-debugify -passes=strip -S -o - < %s 2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-WARN + +; RUN: opt --experimental-debuginfo-iterators=false -enable-debugify -S -o - < %s 2>&1 | FileCheck %s -check-prefix=PASS + +; RUN: opt --experimental-debuginfo-iterators=false -enable-debugify -O1 < %s | opt -O2 -o /dev/null +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,mem2reg,check-debugify < %s | opt -O2 -o /dev/null + ; CHECK-LABEL: define void @foo define void @foo() { ; CHECK: ret void, !dbg ![[RET1:.*]] diff --git a/llvm/test/DebugInfo/pr37964.ll b/llvm/test/DebugInfo/pr37964.ll index 9581f1a6b35dc5..63db67d2bd37f7 100644 --- a/llvm/test/DebugInfo/pr37964.ll +++ b/llvm/test/DebugInfo/pr37964.ll @@ -1,4 +1,5 @@ ; RUN: opt -disable-output -debugify-each -passes=gvn < %s 2>&1 | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false -disable-output -debugify-each -passes=gvn < %s 2>&1 | FileCheck %s ; CHECK-NOT: ERROR: Instruction with empty DebugLoc in function _Z3bazv -- {{%.*}} = phi ; CHECK: CheckFunctionDebugify [GVNPass]: PASS diff --git a/llvm/test/DebugInfo/salvage-cast-debug-info.ll b/llvm/test/DebugInfo/salvage-cast-debug-info.ll index 4676aee3d4e480..b72f717a4f2de7 100644 --- a/llvm/test/DebugInfo/salvage-cast-debug-info.ll +++ b/llvm/test/DebugInfo/salvage-cast-debug-info.ll @@ -1,5 +1,5 @@ ; RUN: opt %s -passes=debugify,early-cse -earlycse-debug-hash -S | FileCheck %s -; RUN: opt %s -passes=debugify,early-cse -earlycse-debug-hash -S --try-experimental-debuginfo-iterators | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false %s -passes=debugify,early-cse -earlycse-debug-hash -S | FileCheck %s define i32 @foo(i64 %nose, i32 %more) { ; CHECK-LABEL: @foo( ; CHECK: call void @llvm.dbg.value(metadata i64 %nose, metadata [[V1:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_convert, 64, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned diff --git a/llvm/test/DebugInfo/verify-di-preserve.ll b/llvm/test/DebugInfo/verify-di-preserve.ll index a2f1b1dd78dc5a..92fc62a0b34c47 100644 --- a/llvm/test/DebugInfo/verify-di-preserve.ll +++ b/llvm/test/DebugInfo/verify-di-preserve.ll @@ -1,10 +1,10 @@ ; RUN: opt %s -verify-debuginfo-preserve -passes=instcombine -disable-output 2>&1 | FileCheck --check-prefix=VERIFY %s -; RUN: opt --try-experimental-debuginfo-iterators %s -verify-debuginfo-preserve -passes=instcombine -disable-output 2>&1 | FileCheck --check-prefix=VERIFY %s +; RUN: opt --experimental-debuginfo-iterators=false %s -verify-debuginfo-preserve -passes=instcombine -disable-output 2>&1 | FileCheck --check-prefix=VERIFY %s ; VERIFY: CheckModuleDebugify (original debuginfo): ; RUN: opt %s -verify-each-debuginfo-preserve -O2 -disable-output 2>&1 | FileCheck 
--check-prefix=VERIFY-EACH %s
-; RUN: opt %s --try-experimental-debuginfo-iterators -verify-each-debuginfo-preserve -O2 -disable-output 2>&1 | FileCheck --check-prefix=VERIFY-EACH %s
+; RUN: opt %s --experimental-debuginfo-iterators=false -verify-each-debuginfo-preserve -O2 -disable-output 2>&1 | FileCheck --check-prefix=VERIFY-EACH %s
 
 ; VERIFY-EACH: DeadArgumentEliminationPass
 ; VERIFY-EACH: GlobalDCEPass

From 45eabd1362527d1b3a27a90f7479865785b763ee Mon Sep 17 00:00:00 2001
From: YunQiang Su
Date: Wed, 17 Apr 2024 00:52:15 +0800
Subject: [PATCH 126/300] [flang,test] Add -resource-dir option to
 msvc-dependent-lib-flags.f90 (#88894)

For aarch64-windows-msvc, clang_rt.builtins is placed in the windows
subdirectory instead of the triple subdirectory, and the library is named
clang_rt.builtins-aarch64.lib. So let's use the `-resource-dir` option to
fix the test failure. Please see the discussion on PR #87866.

---
 flang/test/Driver/msvc-dependent-lib-flags.f90 | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/flang/test/Driver/msvc-dependent-lib-flags.f90 b/flang/test/Driver/msvc-dependent-lib-flags.f90
index 643dbe9e949cbb..6cfc969e92b20a 100644
--- a/flang/test/Driver/msvc-dependent-lib-flags.f90
+++ b/flang/test/Driver/msvc-dependent-lib-flags.f90
@@ -1,7 +1,7 @@
-! RUN: %flang -### --target=aarch64-windows-msvc %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC
-! RUN: %flang -### --target=aarch64-windows-msvc -fms-runtime-lib=static_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DEBUG
-! RUN: %flang -### --target=aarch64-windows-msvc -fms-runtime-lib=dll %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL
-! RUN: %flang -### --target=aarch64-windows-msvc -fms-runtime-lib=dll_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL-DEBUG
+! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC
+! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=static_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DEBUG
+! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL
+! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL-DEBUG
 
 ! MSVC: -fc1
 ! MSVC-SAME: --dependent-lib=clang_rt.builtins.lib

From 22e6bf77ad8781810fc81fff4c447c03cdf6f419 Mon Sep 17 00:00:00 2001
From: Volodymyr Sapsai
Date: Tue, 16 Apr 2024 10:12:26 -0700
Subject: [PATCH 127/300] [unused-includes][Serialization] Remove unused
 includes. NFC.
(#88790) --- clang/include/clang/Serialization/ModuleFileExtension.h | 1 - clang/lib/Serialization/ASTReader.cpp | 1 - clang/lib/Serialization/ASTWriterDecl.cpp | 1 - clang/lib/Serialization/ASTWriterStmt.cpp | 1 - clang/lib/Serialization/GeneratePCH.cpp | 1 - clang/lib/Serialization/GlobalModuleIndex.cpp | 1 - clang/lib/Serialization/ModuleFileExtension.cpp | 2 +- clang/lib/Serialization/PCHContainerOperations.cpp | 2 -- 8 files changed, 1 insertion(+), 9 deletions(-) diff --git a/clang/include/clang/Serialization/ModuleFileExtension.h b/clang/include/clang/Serialization/ModuleFileExtension.h index d7d456c8b5db8e..50ce401516275c 100644 --- a/clang/include/clang/Serialization/ModuleFileExtension.h +++ b/clang/include/clang/Serialization/ModuleFileExtension.h @@ -9,7 +9,6 @@ #ifndef LLVM_CLANG_SERIALIZATION_MODULEFILEEXTENSION_H #define LLVM_CLANG_SERIALIZATION_MODULEFILEEXTENSION_H -#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/Support/ExtensibleRTTI.h" #include "llvm/Support/HashBuilder.h" #include "llvm/Support/MD5.h" diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index cf0726460bfca7..b28df03b4a95e9 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -31,7 +31,6 @@ #include "clang/AST/ExternalASTSource.h" #include "clang/AST/NestedNameSpecifier.h" #include "clang/AST/ODRDiagsEmitter.h" -#include "clang/AST/ODRHash.h" #include "clang/AST/OpenACCClause.h" #include "clang/AST/OpenMPClause.h" #include "clang/AST/RawCommentList.h" diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index d0d49bcdf991a9..c6db107e0ca429 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -16,7 +16,6 @@ #include "clang/AST/DeclTemplate.h" #include "clang/AST/DeclVisitor.h" #include "clang/AST/Expr.h" -#include "clang/AST/ODRHash.h" #include "clang/AST/OpenMPClause.h" #include "clang/AST/PrettyDeclStackTrace.h" #include "clang/Basic/SourceManager.h" diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index e3816181e2b2b9..a736a7b0ef726c 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -19,7 +19,6 @@ #include "clang/AST/ExprOpenMP.h" #include "clang/AST/StmtVisitor.h" #include "clang/Lex/Token.h" -#include "clang/Sema/DeclSpec.h" #include "clang/Serialization/ASTRecordWriter.h" #include "llvm/Bitstream/BitstreamWriter.h" using namespace clang; diff --git a/clang/lib/Serialization/GeneratePCH.cpp b/clang/lib/Serialization/GeneratePCH.cpp index 2fece29f34487e..bed74399098d7f 100644 --- a/clang/lib/Serialization/GeneratePCH.cpp +++ b/clang/lib/Serialization/GeneratePCH.cpp @@ -17,7 +17,6 @@ #include "clang/Lex/HeaderSearchOptions.h" #include "clang/Lex/Preprocessor.h" #include "clang/Sema/SemaConsumer.h" -#include "clang/Serialization/ASTReader.h" #include "clang/Serialization/ASTWriter.h" #include "llvm/Bitstream/BitstreamWriter.h" diff --git a/clang/lib/Serialization/GlobalModuleIndex.cpp b/clang/lib/Serialization/GlobalModuleIndex.cpp index 8ff10f6a8621e8..f09ceb8d316206 100644 --- a/clang/lib/Serialization/GlobalModuleIndex.cpp +++ b/clang/lib/Serialization/GlobalModuleIndex.cpp @@ -13,7 +13,6 @@ #include "clang/Serialization/GlobalModuleIndex.h" #include "ASTReaderInternals.h" #include "clang/Basic/FileManager.h" -#include "clang/Lex/HeaderSearch.h" #include "clang/Serialization/ASTBitCodes.h" #include 
"clang/Serialization/ModuleFile.h" #include "clang/Serialization/PCHContainerOperations.h" diff --git a/clang/lib/Serialization/ModuleFileExtension.cpp b/clang/lib/Serialization/ModuleFileExtension.cpp index 95fff41e0d7a85..729529b5fca18c 100644 --- a/clang/lib/Serialization/ModuleFileExtension.cpp +++ b/clang/lib/Serialization/ModuleFileExtension.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// #include "clang/Serialization/ModuleFileExtension.h" -#include "llvm/ADT/Hashing.h" + using namespace clang; char ModuleFileExtension::ID = 0; diff --git a/clang/lib/Serialization/PCHContainerOperations.cpp b/clang/lib/Serialization/PCHContainerOperations.cpp index 56ca3394385b4f..4aedb7debcff28 100644 --- a/clang/lib/Serialization/PCHContainerOperations.cpp +++ b/clang/lib/Serialization/PCHContainerOperations.cpp @@ -12,8 +12,6 @@ #include "clang/Serialization/PCHContainerOperations.h" #include "clang/AST/ASTConsumer.h" -#include "clang/Lex/ModuleLoader.h" -#include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Support/raw_ostream.h" #include From b566810add5b7c5695bdd2c39710b78af9dc83ba Mon Sep 17 00:00:00 2001 From: Volodymyr Sapsai Date: Tue, 16 Apr 2024 10:13:15 -0700 Subject: [PATCH 128/300] [unused-includes] PCHContainerOperations uses MemoryBufferRef, not MemoryBuffer. NFC. (#88794) --- clang/include/clang/Serialization/PCHContainerOperations.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Serialization/PCHContainerOperations.h b/clang/include/clang/Serialization/PCHContainerOperations.h index ddfddf2dafadf9..c9a7e334ce6eb3 100644 --- a/clang/include/clang/Serialization/PCHContainerOperations.h +++ b/clang/include/clang/Serialization/PCHContainerOperations.h @@ -12,7 +12,7 @@ #include "clang/Basic/Module.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MemoryBufferRef.h" #include namespace llvm { From 454d4496970f665200b5b300578894d78405b6ca Mon Sep 17 00:00:00 2001 From: Xing Xue Date: Tue, 16 Apr 2024 13:13:49 -0400 Subject: [PATCH 129/300] [OpenMP] Use a memory fence before incrementing the dispatch buffer index (#87995) This patch uses a memory fence in function `__kmp_dispatch_next()` to flush pending memory write invalidates before incrementing the `volatile` variable `buffer_index` to fix intermittent time-outs of OpenMP runtime LIT test cases `env/kmp_set_dispatch_buf.c` and `worksharing/for/kmp_set_dispatch_buf.c`, noting that the same is needed for incrementing `buffer_index` in function `__kmpc_next_section()` (line 2600 of `kmp_dispatch.cpp`). --- openmp/runtime/src/kmp_dispatch.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp index ac85b2b3f2fcd4..fc333765118179 100644 --- a/openmp/runtime/src/kmp_dispatch.cpp +++ b/openmp/runtime/src/kmp_dispatch.cpp @@ -2397,6 +2397,8 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, sh->u.s.ordered_iteration = 0; } + KMP_MB(); /* Flush all pending memory write invalidates. 
*/ + sh->buffer_index += __kmp_dispatch_num_buffers; KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", gtid, sh->buffer_index)); From 22bba85d82637d0446928ff6ff41f98583f3d3b2 Mon Sep 17 00:00:00 2001 From: Xing Xue Date: Tue, 16 Apr 2024 13:14:29 -0400 Subject: [PATCH 130/300] [OpenMP][test][AIX] Make 64 the max number of threads for capacity tests in AIX 32-bit (#88739) This patch makes 64 the max number of threads for 2 capacity tests in AIX 32-bit mode rather than `XFAIL`ing them. --- .../capacity_mix_threads.cpp | 20 +++++++++++++------ .../hidden_helper_task/capacity_nthreads.cpp | 20 +++++++++++++------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp index 3f2ceef0c4add4..36825dbebafb51 100644 --- a/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp +++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp @@ -1,7 +1,4 @@ // RUN: %libomp-cxx-compile-and-run -// -// AIX runs out of resource in 32-bit with 4*omp_get_max_threads() threads. -// XFAIL: aix && ppc #include @@ -11,6 +8,12 @@ #include #include +// AIX runs out of resource in 32-bit if 4*omp_get_max_threads() is more +// than 64 threads with the default stack size. +#if defined(_AIX) && !__LP64__ +#define MAX_THREADS 64 +#endif + void dummy_root() { // omp_get_max_threads() will do middle initialization int nthreads = omp_get_max_threads(); @@ -18,9 +21,14 @@ void dummy_root() { } int main(int argc, char *argv[]) { - const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()), - 4 * omp_get_num_procs()), - std::numeric_limits::max()); + int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()), + 4 * omp_get_num_procs()), + std::numeric_limits::max()); + +#if defined(_AIX) && !__LP64__ + if (N > MAX_THREADS) + N = MAX_THREADS; +#endif std::vector data(N); diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp index f7405d00255cb9..1cceee95e704b8 100644 --- a/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp +++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp @@ -1,7 +1,4 @@ // RUN: %libomp-cxx-compile-and-run -// -// AIX runs out of resource in 32-bit with 4*omp_get_max_threads() threads. -// XFAIL: aix && ppc #include @@ -10,10 +7,21 @@ #include #include +// AIX runs out of resource in 32-bit if 4*omp_get_max_threads() is more +// than 64 threads with the default stacksize. +#if defined(_AIX) && !__LP64__ +#define MAX_THREADS 64 +#endif + int main(int argc, char *argv[]) { - const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()), - 4 * omp_get_num_procs()), - std::numeric_limits::max()); + int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()), + 4 * omp_get_num_procs()), + std::numeric_limits::max()); + +#if defined(_AIX) && !__LP64__ + if (N > MAX_THREADS) + N = MAX_THREADS; +#endif std::vector data(N); From 8137bd9e03d636a27701a85b6efe899f9571cac5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 16 Apr 2024 10:16:48 -0700 Subject: [PATCH 131/300] [memprof] Use CSId to construct MemProfRecord (#88362) We are in the process of referring to call stacks with CallStackId in IndexedMemProfRecord and IndexedAllocationInfo instead of holding call stacks inline (both in memory and the serialized format). 
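To make the indirection concrete, here is a minimal standalone sketch; the
hashing and the map type are stand-ins for the real hashCallStack and
llvm::DenseMap, not the actual MemProf implementation:

  #include <cstdint>
  #include <map>
  #include <vector>

  using FrameId = uint64_t;
  using CallStackId = uint64_t;

  // Stand-in for memprof::hashCallStack (FNV-1a over the frame ids here).
  CallStackId hashCallStack(const std::vector<FrameId> &CS) {
    CallStackId H = 14695981039346656037ull; // FNV-1a offset basis
    for (FrameId F : CS) {
      H ^= F;
      H *= 1099511628211ull; // FNV-1a prime
    }
    return H;
  }

  int main() {
    std::map<CallStackId, std::vector<FrameId>> CSIdToCallStack;
    std::vector<FrameId> CS = {0x10, 0x20, 0x30};
    // The frames are stored once, keyed by their hash; each record then
    // carries only the 64-bit CallStackId instead of the frame list.
    CallStackId CSId = hashCallStack(CS);
    CSIdToCallStack.insert({CSId, CS});
    // A second record with an identical stack maps to the same entry.
    CSIdToCallStack.insert({hashCallStack(CS), CS}); // no-op: key exists
  }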
Doing so deduplicates call stacks and reduces the MemProf profile file size. Before we can eliminate the two fields holding call stacks inline: - IndexedAllocationInfo::CallStack - IndexedMemProfRecord::CallSites we need to eliminate all the read operations on them. This patch is a step toward that direction. Specifically, we eliminate the read operations in the context of MemProfReader and RawMemProfReader. A subsequent patch will eliminate the read operations during the serialization. --- llvm/include/llvm/ProfileData/MemProf.h | 8 ++ llvm/include/llvm/ProfileData/MemProfReader.h | 20 ++++- llvm/lib/ProfileData/MemProf.cpp | 18 ++++ llvm/lib/ProfileData/MemProfReader.cpp | 26 ++++++ llvm/unittests/ProfileData/MemProfTest.cpp | 85 +++++++++++++++++++ 5 files changed, 153 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 0431c182276ec6..3520034fb1c946 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -16,6 +16,8 @@ namespace llvm { namespace memprof { +struct MemProfRecord; + // The versions of the indexed MemProf format enum IndexedVersion : uint64_t { // Version 0: This version didn't have a version field. @@ -392,6 +394,12 @@ struct IndexedMemProfRecord { const unsigned char *Buffer, IndexedVersion Version); + // Convert IndexedMemProfRecord to MemProfRecord. Callback is used to + // translate CallStackId to call stacks with frames inline. + MemProfRecord toMemProfRecord( + std::function(const CallStackId)> Callback) + const; + // Returns the GUID for the function name after canonicalization. For // memprof, we remove any .llvm suffix added by LTO. MemProfRecords are // mapped to functions using this GUID. diff --git a/llvm/include/llvm/ProfileData/MemProfReader.h b/llvm/include/llvm/ProfileData/MemProfReader.h index 89f49a20a6089f..1f84fefad03e39 100644 --- a/llvm/include/llvm/ProfileData/MemProfReader.h +++ b/llvm/include/llvm/ProfileData/MemProfReader.h @@ -70,8 +70,20 @@ class MemProfReader { Callback = std::bind(&MemProfReader::idToFrame, this, std::placeholders::_1); + auto CallStackCallback = [&](CallStackId CSId) { + llvm::SmallVector CallStack; + auto Iter = CSIdToCallStack.find(CSId); + assert(Iter != CSIdToCallStack.end()); + for (FrameId Id : Iter->second) + CallStack.push_back(Callback(Id)); + return CallStack; + }; + const IndexedMemProfRecord &IndexedRecord = Iter->second; - GuidRecord = {Iter->first, MemProfRecord(IndexedRecord, Callback)}; + GuidRecord = { + Iter->first, + IndexedRecord.toMemProfRecord(CallStackCallback), + }; Iter++; return Error::success(); } @@ -84,9 +96,7 @@ class MemProfReader { // Initialize the MemProfReader with the frame mappings and profile contents. MemProfReader( llvm::DenseMap FrameIdMap, - llvm::MapVector ProfData) - : IdToFrame(std::move(FrameIdMap)), - FunctionProfileData(std::move(ProfData)) {} + llvm::MapVector ProfData); protected: // A helper method to extract the frame from the IdToFrame map. @@ -97,6 +107,8 @@ class MemProfReader { } // A mapping from FrameId (a hash of the contents) to the frame. llvm::DenseMap IdToFrame; + // A mapping from CallStackId to the call stack. + llvm::DenseMap> CSIdToCallStack; // A mapping from function GUID, hash of the canonical function symbol to the // memprof profile data for that function, i.e allocation and callsite info. 
llvm::MapVector FunctionProfileData; diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 97414505f1c134..1ca0a02d3cbde1 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -224,6 +224,24 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema, llvm_unreachable("unsupported MemProf version"); } +MemProfRecord IndexedMemProfRecord::toMemProfRecord( + std::function(const CallStackId)> Callback) + const { + MemProfRecord Record; + + for (const memprof::IndexedAllocationInfo &IndexedAI : AllocSites) { + memprof::AllocationInfo AI; + AI.Info = IndexedAI.Info; + AI.CallStack = Callback(IndexedAI.CSId); + Record.AllocSites.push_back(AI); + } + + for (memprof::CallStackId CSId : CallSiteIds) + Record.CallSites.push_back(Callback(CSId)); + + return Record; +} + GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) { // Canonicalize the function name to drop suffixes such as ".llvm.". Note // we do not drop any ".__uniq." suffixes, as getCanonicalFnName does not drop diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp index 580867a9083fde..91556f036c7771 100644 --- a/llvm/lib/ProfileData/MemProfReader.cpp +++ b/llvm/lib/ProfileData/MemProfReader.cpp @@ -183,6 +183,28 @@ std::string getBuildIdString(const SegmentEntry &Entry) { } } // namespace +MemProfReader::MemProfReader( + llvm::DenseMap FrameIdMap, + llvm::MapVector ProfData) + : IdToFrame(std::move(FrameIdMap)), + FunctionProfileData(std::move(ProfData)) { + // Populate CSId in each IndexedAllocationInfo and IndexedMemProfRecord + // while storing CallStack in CSIdToCallStack. + for (auto &KV : FunctionProfileData) { + IndexedMemProfRecord &Record = KV.second; + for (auto &AS : Record.AllocSites) { + CallStackId CSId = hashCallStack(AS.CallStack); + AS.CSId = CSId; + CSIdToCallStack.insert({CSId, AS.CallStack}); + } + for (auto &CS : Record.CallSites) { + CallStackId CSId = hashCallStack(CS); + Record.CallSiteIds.push_back(CSId); + CSIdToCallStack.insert({CSId, CS}); + } + } +} + Expected> RawMemProfReader::create(const Twine &Path, const StringRef ProfiledBinary, bool KeepName) { @@ -445,6 +467,7 @@ Error RawMemProfReader::mapRawProfileToRecords() { } CallStackId CSId = hashCallStack(Callstack); + CSIdToCallStack.insert({CSId, Callstack}); // We attach the memprof record to each function bottom-up including the // first non-inline frame. 
@@ -467,7 +490,10 @@ Error RawMemProfReader::mapRawProfileToRecords() { auto Result = FunctionProfileData.insert({Id, IndexedMemProfRecord()}); IndexedMemProfRecord &Record = Result.first->second; for (LocationPtr Loc : Locs) { + CallStackId CSId = hashCallStack(*Loc); + CSIdToCallStack.insert({CSId, *Loc}); Record.CallSites.push_back(*Loc); + Record.CallSiteIds.push_back(CSId); } } diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index 9cf307472d656e..ab9227e9df881b 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -21,9 +21,11 @@ using ::llvm::DILineInfo; using ::llvm::DILineInfoSpecifier; using ::llvm::DILocal; using ::llvm::StringRef; +using ::llvm::memprof::CallStackId; using ::llvm::memprof::CallStackMap; using ::llvm::memprof::Frame; using ::llvm::memprof::FrameId; +using ::llvm::memprof::IndexedAllocationInfo; using ::llvm::memprof::IndexedMemProfRecord; using ::llvm::memprof::MemInfoBlock; using ::llvm::memprof::MemProfReader; @@ -36,6 +38,7 @@ using ::llvm::memprof::SegmentEntry; using ::llvm::object::SectionedAddress; using ::llvm::symbolize::SymbolizableModule; using ::testing::Return; +using ::testing::SizeIs; class MockSymbolizer : public SymbolizableModule { public: @@ -432,4 +435,86 @@ TEST(MemProf, BaseMemProfReader) { EXPECT_THAT(Records[0].AllocSites[0].CallStack[1], FrameContains("bar", 10U, 2U, false)); } + +TEST(MemProf, IndexedMemProfRecordToMemProfRecord) { + // Verify that MemProfRecord can be constructed from IndexedMemProfRecord with + // CallStackIds only. + + llvm::DenseMap FrameIdMap; + Frame F1(1, 0, 0, false); + Frame F2(2, 0, 0, false); + Frame F3(3, 0, 0, false); + Frame F4(4, 0, 0, false); + FrameIdMap.insert({F1.hash(), F1}); + FrameIdMap.insert({F2.hash(), F2}); + FrameIdMap.insert({F3.hash(), F3}); + FrameIdMap.insert({F4.hash(), F4}); + + llvm::DenseMap> CallStackIdMap; + llvm::SmallVector CS1 = {F1.hash(), F2.hash()}; + llvm::SmallVector CS2 = {F1.hash(), F3.hash()}; + llvm::SmallVector CS3 = {F2.hash(), F3.hash()}; + llvm::SmallVector CS4 = {F2.hash(), F4.hash()}; + CallStackIdMap.insert({llvm::memprof::hashCallStack(CS1), CS1}); + CallStackIdMap.insert({llvm::memprof::hashCallStack(CS2), CS2}); + CallStackIdMap.insert({llvm::memprof::hashCallStack(CS3), CS3}); + CallStackIdMap.insert({llvm::memprof::hashCallStack(CS4), CS4}); + + IndexedMemProfRecord IndexedRecord; + IndexedAllocationInfo AI; + AI.CSId = llvm::memprof::hashCallStack(CS1); + IndexedRecord.AllocSites.push_back(AI); + AI.CSId = llvm::memprof::hashCallStack(CS2); + IndexedRecord.AllocSites.push_back(AI); + IndexedRecord.CallSiteIds.push_back(llvm::memprof::hashCallStack(CS3)); + IndexedRecord.CallSiteIds.push_back(llvm::memprof::hashCallStack(CS4)); + + bool CSIdMissing = false; + bool FrameIdMissing = false; + + auto Callback = [&](CallStackId CSId) -> llvm::SmallVector { + llvm::SmallVector CallStack; + llvm::SmallVector FrameIds; + + auto Iter = CallStackIdMap.find(CSId); + if (Iter == CallStackIdMap.end()) + CSIdMissing = true; + else + FrameIds = Iter->second; + + for (FrameId Id : FrameIds) { + Frame F(0, 0, 0, false); + auto Iter = FrameIdMap.find(Id); + if (Iter == FrameIdMap.end()) + FrameIdMissing = true; + else + F = Iter->second; + CallStack.push_back(F); + } + + return CallStack; + }; + + MemProfRecord Record = IndexedRecord.toMemProfRecord(Callback); + + // Make sure that all lookups are successful. 
+ ASSERT_FALSE(CSIdMissing); + ASSERT_FALSE(FrameIdMissing); + + // Verify the contents of Record. + ASSERT_THAT(Record.AllocSites, SizeIs(2)); + ASSERT_THAT(Record.AllocSites[0].CallStack, SizeIs(2)); + EXPECT_EQ(Record.AllocSites[0].CallStack[0].hash(), F1.hash()); + EXPECT_EQ(Record.AllocSites[0].CallStack[1].hash(), F2.hash()); + ASSERT_THAT(Record.AllocSites[1].CallStack, SizeIs(2)); + EXPECT_EQ(Record.AllocSites[1].CallStack[0].hash(), F1.hash()); + EXPECT_EQ(Record.AllocSites[1].CallStack[1].hash(), F3.hash()); + ASSERT_THAT(Record.CallSites, SizeIs(2)); + ASSERT_THAT(Record.CallSites[0], SizeIs(2)); + EXPECT_EQ(Record.CallSites[0][0].hash(), F2.hash()); + EXPECT_EQ(Record.CallSites[0][1].hash(), F3.hash()); + ASSERT_THAT(Record.CallSites[1], SizeIs(2)); + EXPECT_EQ(Record.CallSites[1][0].hash(), F2.hash()); + EXPECT_EQ(Record.CallSites[1][1].hash(), F4.hash()); +} } // namespace From 8cd8ebe153391993a3668d2ac8d2994d9491f3ef Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 16 Apr 2024 10:25:02 -0700 Subject: [PATCH 132/300] [MLGO] Use double comparison facilities for reg alloc scoring tests (#88862) This patch switches from using direct equality (ASSERT_EQ) to the floating point comparison facilities (ASSERT_DOUBLE_EQ) within google test to avoid weird floating point problems. There is at least one downstream that maintains a patch for issues cropping up from the direct equality. https://gitlab.alpinelinux.org/alpine/aports/-/blob/master/main/llvm17/allocscore.patch --- llvm/unittests/CodeGen/RegAllocScoreTest.cpp | 27 ++++++++++---------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/llvm/unittests/CodeGen/RegAllocScoreTest.cpp b/llvm/unittests/CodeGen/RegAllocScoreTest.cpp index ff7146eaf9439a..eae517f9d01cf2 100644 --- a/llvm/unittests/CodeGen/RegAllocScoreTest.cpp +++ b/llvm/unittests/CodeGen/RegAllocScoreTest.cpp @@ -166,19 +166,20 @@ TEST(RegAllocScoreTest, Counts) { ASSERT_EQ(MF->size(), 2U); const auto TotalScore = llvm::calculateRegAllocScore(*MF, MBBFreqMock, IsRemat); - ASSERT_EQ(Freq1, TotalScore.copyCounts()); - ASSERT_EQ(2.0 * Freq1 + Freq2, TotalScore.loadCounts()); - ASSERT_EQ(Freq1 + Freq2, TotalScore.storeCounts()); - ASSERT_EQ(Freq2, TotalScore.loadStoreCounts()); - ASSERT_EQ(Freq1, TotalScore.cheapRematCounts()); - ASSERT_EQ(Freq2, TotalScore.expensiveRematCounts()); - ASSERT_EQ(TotalScore.getScore(), - TotalScore.copyCounts() * CopyWeight + - TotalScore.loadCounts() * LoadWeight + - TotalScore.storeCounts() * StoreWeight + - TotalScore.loadStoreCounts() * (LoadWeight + StoreWeight) + - TotalScore.cheapRematCounts() * CheapRematWeight + - TotalScore.expensiveRematCounts() * ExpensiveRematWeight + ASSERT_DOUBLE_EQ(Freq1, TotalScore.copyCounts()); + ASSERT_DOUBLE_EQ(2.0 * Freq1 + Freq2, TotalScore.loadCounts()); + ASSERT_DOUBLE_EQ(Freq1 + Freq2, TotalScore.storeCounts()); + ASSERT_DOUBLE_EQ(Freq2, TotalScore.loadStoreCounts()); + ASSERT_DOUBLE_EQ(Freq1, TotalScore.cheapRematCounts()); + ASSERT_DOUBLE_EQ(Freq2, TotalScore.expensiveRematCounts()); + ASSERT_DOUBLE_EQ( + TotalScore.getScore(), + TotalScore.copyCounts() * CopyWeight + + TotalScore.loadCounts() * LoadWeight + + TotalScore.storeCounts() * StoreWeight + + TotalScore.loadStoreCounts() * (LoadWeight + StoreWeight) + + TotalScore.cheapRematCounts() * CheapRematWeight + + TotalScore.expensiveRematCounts() * ExpensiveRematWeight ); } From 75054525ae58f26c86e418382164540760871186 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 16 Apr 2024 18:42:23 +0200 Subject: 
[PATCH 133/300] [clang][Interp] Implement VectorSplat casts --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 29 ++++++++++++++++++++++++ clang/lib/AST/Interp/ByteCodeExprGen.h | 9 +++++++- clang/test/AST/Interp/vectors.cpp | 17 ++++++++++++-- 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 00c4a9f161304a..6b4b51aac41e84 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -398,6 +398,35 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { return true; } + case CK_VectorSplat: { + assert(!classify(CE->getType())); + assert(classify(SubExpr->getType())); + assert(CE->getType()->isVectorType()); + + if (DiscardResult) + return this->discard(SubExpr); + + assert(Initializing); // FIXME: Not always correct. + const auto *VT = CE->getType()->getAs(); + PrimType ElemT = classifyPrim(SubExpr); + unsigned ElemOffset = allocateLocalPrimitive( + SubExpr, ElemT, /*IsConst=*/true, /*IsExtended=*/false); + + if (!this->visit(SubExpr)) + return false; + if (!this->emitSetLocal(ElemT, ElemOffset, CE)) + return false; + + for (unsigned I = 0; I != VT->getNumElements(); ++I) { + if (!this->emitGetLocal(ElemT, ElemOffset, CE)) + return false; + if (!this->emitInitElem(ElemT, I, CE)) + return false; + } + + return true; + } + case CK_ToVoid: return discard(SubExpr); diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index db0d73ce23f7c4..7e9dc8631fc0d3 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -148,13 +148,20 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, return Ctx.classify(Ty); } - /// Classifies a known primitive type + /// Classifies a known primitive type. PrimType classifyPrim(QualType Ty) const { if (auto T = classify(Ty)) { return *T; } llvm_unreachable("not a primitive type"); } + /// Classifies a known primitive expression. + PrimType classifyPrim(const Expr *E) const { + if (auto T = classify(E)) + return *T; + llvm_unreachable("not a primitive type"); + } + /// Evaluates an expression and places the result on the stack. If the /// expression is of composite type, a local variable will be created /// and a pointer to said variable will be placed on the stack. 
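The new CK_VectorSplat handling evaluates the scalar subexpression once into
a temporary local, then initializes each vector element from it. As plain
C++ this behaves roughly like the sketch below (illustrative only; the real
implementation emits SetLocal/GetLocal/InitElem byte code rather than
operating on an array):

  #include <array>

  // Splat one scalar into every lane of an N-element vector.
  template <typename Elem, unsigned N>
  std::array<Elem, N> splat(Elem Scalar) { // Scalar = the SetLocal temporary
    std::array<Elem, N> V;
    for (unsigned I = 0; I != N; ++I)
      V[I] = Scalar;                       // GetLocal + InitElem per lane
    return V;
  }

The test updates below exercise exactly this: (float4)0.5f yields
{0.5, 0.5, 0.5, 0.5}.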
diff --git a/clang/test/AST/Interp/vectors.cpp b/clang/test/AST/Interp/vectors.cpp
index 6c5d916f51f563..5c4694f122d812 100644
--- a/clang/test/AST/Interp/vectors.cpp
+++ b/clang/test/AST/Interp/vectors.cpp
@@ -1,10 +1,23 @@
 // RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
 // RUN: %clang_cc1 -verify=ref,both %s
 
-// ref-no-diagnostics
-
 typedef int __attribute__((vector_size(16))) VI4;
 constexpr VI4 A = {1,2,3,4};
+static_assert(A[0] == 1, ""); // ref-error {{not an integral constant expression}}
+static_assert(A[1] == 2, ""); // ref-error {{not an integral constant expression}}
+static_assert(A[2] == 3, ""); // ref-error {{not an integral constant expression}}
+static_assert(A[3] == 4, ""); // ref-error {{not an integral constant expression}}
+
+/// VectorSplat casts
+typedef __attribute__(( ext_vector_type(4) )) float float4;
+constexpr float4 vec4_0 = (float4)0.5f;
+static_assert(vec4_0[0] == 0.5, ""); // ref-error {{not an integral constant expression}}
+static_assert(vec4_0[1] == 0.5, ""); // ref-error {{not an integral constant expression}}
+static_assert(vec4_0[2] == 0.5, ""); // ref-error {{not an integral constant expression}}
+static_assert(vec4_0[3] == 0.5, ""); // ref-error {{not an integral constant expression}}
+constexpr int vec4_0_discarded = ((float4)12.0f, 0);
+
+
 /// From constant-expression-cxx11.cpp
 namespace Vector {

From 184ba038ac1d444980b3e554b0057f3f30c516ab Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Tue, 16 Apr 2024 10:46:27 -0700
Subject: [PATCH 134/300] [RISCV] Avoid matching 3/5/9 * 2^N as 2^N + 2/4/8
 (e.g. 24) (#88937)

The former is better as a zero extend can be folded into the sll, whereas
the latter currently produces a separate zext.w due to bad interactions
with other combines.

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |  6 ++++++
 llvm/test/CodeGen/RISCV/addimm-mulimm.ll    |  9 ++++-----
 llvm/test/CodeGen/RISCV/rv64zba.ll          | 22 +++++++++++++++++++++
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 765838aafb58d2..de2ad639f0d6c8 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -13416,6 +13416,12 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
     return SDValue();
   uint64_t MulAmt = CNode->getZExtValue();
 
+  // 3/5/9 * 2^N -> shXadd (sll X, C), (sll X, C)
+  // Matched in tablegen, avoid perturbing patterns.
+  for (uint64_t Divisor : {3, 5, 9})
+    if (MulAmt % Divisor == 0 && isPowerOf2_64(MulAmt / Divisor))
+      return SDValue();
+
   // If this is a power 2 + 2/4/8, we can use a shift followed by a single
   // shXadd. First check if this a sum of two power of 2s because that's
   // easy. Then count how many zeros are up to the first bit.
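To see which multipliers the new early-out covers, the predicate can be
checked in isolation (a standalone sketch; isPowerOf2_64 is a local stand-in
for the LLVM helper of the same name):

  #include <cstdint>
  #include <cstdio>

  static bool isPowerOf2_64(uint64_t V) { return V && (V & (V - 1)) == 0; }

  int main() {
    // Prints the 3/5/9 * 2^N multipliers up to 100, e.g. 24 = 3 * 2^3;
    // these are left to the existing shXadd+shift tablegen patterns.
    for (uint64_t MulAmt = 2; MulAmt <= 100; ++MulAmt)
      for (uint64_t Divisor : {3, 5, 9})
        if (MulAmt % Divisor == 0 && isPowerOf2_64(MulAmt / Divisor))
          printf("%llu = %llu * 2^k\n", (unsigned long long)MulAmt,
                 (unsigned long long)Divisor);
  }

For 24 this keeps the shXadd+shift lowering, where a preceding zero extend
folds into the shift (slli.uw), as the rv64zba regression test below
demonstrates.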
diff --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll index 10103f071462c5..48fa69e1045656 100644 --- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll +++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll @@ -551,9 +551,8 @@ define i64 @add_mul_combine_infinite_loop(i64 %x) { ; RV32IMB-NEXT: sh3add a1, a1, a2 ; RV32IMB-NEXT: sh1add a0, a0, a0 ; RV32IMB-NEXT: slli a2, a0, 3 -; RV32IMB-NEXT: li a3, 1 -; RV32IMB-NEXT: slli a3, a3, 11 -; RV32IMB-NEXT: sh3add a0, a0, a3 +; RV32IMB-NEXT: addi a0, a2, 2047 +; RV32IMB-NEXT: addi a0, a0, 1 ; RV32IMB-NEXT: sltu a2, a0, a2 ; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret @@ -562,8 +561,8 @@ define i64 @add_mul_combine_infinite_loop(i64 %x) { ; RV64IMB: # %bb.0: ; RV64IMB-NEXT: addi a0, a0, 86 ; RV64IMB-NEXT: sh1add a0, a0, a0 -; RV64IMB-NEXT: slli a0, a0, 3 -; RV64IMB-NEXT: addi a0, a0, -16 +; RV64IMB-NEXT: li a1, -16 +; RV64IMB-NEXT: sh3add a0, a0, a1 ; RV64IMB-NEXT: ret %tmp0 = mul i64 %x, 24 %tmp1 = add i64 %tmp0, 2048 diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index a84b9e5e7962f6..c3c757656be933 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -2490,3 +2490,25 @@ define ptr @test_gep_gep_dont_crash(ptr %p, i64 %a1, i64 %a2) { %gep2 = getelementptr i64, ptr %gep1, i64 %a1 ret ptr %gep2 } + +define i64 @regression(i32 signext %x, i32 signext %y) { +; RV64I-LABEL: regression: +; RV64I: # %bb.0: +; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: li a1, 3 +; RV64I-NEXT: slli a1, a1, 35 +; RV64I-NEXT: mulhu a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: regression: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: slli.uw a0, a0, 3 +; RV64ZBA-NEXT: sh1add a0, a0, a0 +; RV64ZBA-NEXT: ret + %sub = sub i32 %x, %y + %ext = zext i32 %sub to i64 + %res = mul nuw nsw i64 %ext, 24 + ret i64 %res +} From 4082a7554521572a65a5a0008c4661a534df659d Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 16 Apr 2024 13:48:13 -0400 Subject: [PATCH 135/300] Improve stack usage to increase recursive initialization depth (#88546) We were crashing due to stack exhaustion on rather reasonable C++ template code. After some investigation, I found that we have a stack-allocated object that was huge: `InitializationSequence` was 7016 bytes. This caused an overflow with deep call stacks in initialization code. With these change, `InitializationSequence` is now 248 bytes. With the original code, testing RelWithDebInfo on Windows 10, all the tests in SemaCXX took about 6s 800ms. The max template depth I could reach on my machine using the code in the issue was 708. After that, I would get `-Wstack-exhausted` warnings until crashing at 976 instantiations. With these changes on the same machine, all the tests in SemaCXX took about 6s 500ms. The max template depth I could reach was 1492. After that, I would get `-Wstack-exhausted` warnings until crashing at 2898 instantiations. This improves the behavior of #88330 but there's still an outstanding question of why we run out of stack space and crash in some circumstances before we're able to issue a diagnostic about stack space exhaustion. 
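The mechanical pattern behind the size reduction is shown below as a
standalone sketch (sizes are illustrative; in the patch the member is the
OverloadCandidateSet inside InitializationSequence, moved behind a
std::unique_ptr that is allocated in the constructor):

  #include <memory>

  struct BigCandidateSet { char Storage[6768]; }; // illustrative payload

  // Before: the candidate set is embedded, so every recursive
  // initialization call keeps ~7 KB alive on the stack.
  struct SequenceBefore {
    int State;
    BigCandidateSet Failed;
  };

  // After: only one pointer lives in the frame; the set moves to the heap.
  struct SequenceAfter {
    int State;
    std::unique_ptr<BigCandidateSet> Failed;
    SequenceAfter() : Failed(new BigCandidateSet) {}
  };

  static_assert(sizeof(SequenceAfter) < sizeof(SequenceBefore),
                "frame footprint shrinks");

The same patch also stops reserving inline storage for conversion sequences
in OverloadCandidateSet itself, trading the slab allocator for plain
new[]/delete[] of ImplicitConversionSequence arrays.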
--- clang/docs/ReleaseNotes.rst | 6 ++ clang/include/clang/Sema/Initialization.h | 6 +- clang/include/clang/Sema/Overload.h | 70 ++++++----------------- clang/lib/Sema/SemaInit.cpp | 26 +++++---- clang/lib/Sema/SemaOverload.cpp | 21 +++---- 5 files changed, 54 insertions(+), 75 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d8ec8bcb8df532..e6c345a2f5c0f5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -203,6 +203,12 @@ Non-comprehensive list of changes in this release - ``__typeof_unqual__`` is available in all C modes as an extension, which behaves like ``typeof_unqual`` from C23, similar to ``__typeof__`` and ``typeof``. +- Improved stack usage with C++ initialization code. This allows significantly + more levels of recursive initialization before reaching stack exhaustion + limits. This will positively impact recursive template instantiation code, + but should also reduce memory overhead for initializations in general. + Fixes #GH88330 + New Compiler Flags ------------------ - ``-fsanitize=implicit-bitfield-conversion`` checks implicit truncation and diff --git a/clang/include/clang/Sema/Initialization.h b/clang/include/clang/Sema/Initialization.h index 2072cd8d1c3ef8..1ceacf0f49f568 100644 --- a/clang/include/clang/Sema/Initialization.h +++ b/clang/include/clang/Sema/Initialization.h @@ -1134,7 +1134,7 @@ class InitializationSequence { OverloadingResult FailedOverloadResult; /// The candidate set created when initialization failed. - OverloadCandidateSet FailedCandidateSet; + std::unique_ptr FailedCandidateSet; /// The incomplete type that caused a failure. QualType FailedIncompleteType; @@ -1403,7 +1403,9 @@ class InitializationSequence { /// Retrieve a reference to the candidate set when overload /// resolution fails. OverloadCandidateSet &getFailedCandidateSet() { - return FailedCandidateSet; + assert(FailedCandidateSet && + "this should have been allocated in the constructor!"); + return *FailedCandidateSet; } /// Get the overloading result, for when the initialization diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index 76311b00d2fc58..e6f88bbf7c4f47 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -37,6 +37,7 @@ #include #include #include +#include #include namespace clang { @@ -874,7 +875,8 @@ class Sema; ConversionFixItGenerator Fix; /// Viable - True to indicate that this overload candidate is viable. - bool Viable : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned Viable : 1; /// Whether this candidate is the best viable function, or tied for being /// the best viable function. @@ -883,12 +885,14 @@ class Sema; /// was part of the ambiguity kernel: the minimal non-empty set of viable /// candidates such that all elements of the ambiguity kernel are better /// than all viable candidates not in the ambiguity kernel. - bool Best : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned Best : 1; /// IsSurrogate - True to indicate that this candidate is a /// surrogate for a conversion to a function pointer or reference /// (C++ [over.call.object]). - bool IsSurrogate : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IsSurrogate : 1; /// IgnoreObjectArgument - True to indicate that the first /// argument's conversion, which for this function represents the @@ -897,18 +901,20 @@ class Sema; /// implicit object argument is just a placeholder) or a /// non-static member function when the call doesn't have an /// object argument. 
- bool IgnoreObjectArgument : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned IgnoreObjectArgument : 1; /// True if the candidate was found using ADL. - CallExpr::ADLCallKind IsADLCandidate : 1; + LLVM_PREFERRED_TYPE(CallExpr::ADLCallKind) + unsigned IsADLCandidate : 1; /// Whether this is a rewritten candidate, and if so, of what kind? LLVM_PREFERRED_TYPE(OverloadCandidateRewriteKind) unsigned RewriteKind : 2; /// FailureKind - The reason why this candidate is not viable. - /// Actually an OverloadFailureKind. - unsigned char FailureKind; + LLVM_PREFERRED_TYPE(OverloadFailureKind) + unsigned FailureKind : 5; /// The number of call arguments that were explicitly provided, /// to be used while performing partial ordering of function templates. @@ -972,7 +978,9 @@ class Sema; private: friend class OverloadCandidateSet; OverloadCandidate() - : IsSurrogate(false), IsADLCandidate(CallExpr::NotADL), RewriteKind(CRK_None) {} + : IsSurrogate(false), + IsADLCandidate(static_cast(CallExpr::NotADL)), + RewriteKind(CRK_None) {} }; /// OverloadCandidateSet - A set of overload candidates, used in C++ @@ -1070,51 +1078,16 @@ class Sema; }; private: - SmallVector Candidates; - llvm::SmallPtrSet Functions; - - // Allocator for ConversionSequenceLists. We store the first few of these - // inline to avoid allocation for small sets. - llvm::BumpPtrAllocator SlabAllocator; + SmallVector Candidates; + llvm::SmallPtrSet Functions; SourceLocation Loc; CandidateSetKind Kind; OperatorRewriteInfo RewriteInfo; - constexpr static unsigned NumInlineBytes = - 24 * sizeof(ImplicitConversionSequence); - unsigned NumInlineBytesUsed = 0; - alignas(void *) char InlineSpace[NumInlineBytes]; - // Address space of the object being constructed. LangAS DestAS = LangAS::Default; - /// If we have space, allocates from inline storage. Otherwise, allocates - /// from the slab allocator. - /// FIXME: It would probably be nice to have a SmallBumpPtrAllocator - /// instead. - /// FIXME: Now that this only allocates ImplicitConversionSequences, do we - /// want to un-generalize this? - template - T *slabAllocate(unsigned N) { - // It's simpler if this doesn't need to consider alignment. - static_assert(alignof(T) == alignof(void *), - "Only works for pointer-aligned types."); - static_assert(std::is_trivial::value || - std::is_same::value, - "Add destruction logic to OverloadCandidateSet::clear()."); - - unsigned NBytes = sizeof(T) * N; - if (NBytes > NumInlineBytes - NumInlineBytesUsed) - return SlabAllocator.Allocate(N); - char *FreeSpaceStart = InlineSpace + NumInlineBytesUsed; - assert(uintptr_t(FreeSpaceStart) % alignof(void *) == 0 && - "Misaligned storage!"); - - NumInlineBytesUsed += NBytes; - return reinterpret_cast(FreeSpaceStart); - } - void destroyCandidates(); public: @@ -1163,12 +1136,7 @@ class Sema; ConversionSequenceList allocateConversionSequences(unsigned NumConversions) { ImplicitConversionSequence *Conversions = - slabAllocate(NumConversions); - - // Construct the new objects. 
- for (unsigned I = 0; I != NumConversions; ++I) - new (&Conversions[I]) ImplicitConversionSequence(); - + new ImplicitConversionSequence[NumConversions]; return ConversionSequenceList(Conversions, NumConversions); } diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index fb7a80ab02846c..791c0b6e6df23e 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -6114,7 +6114,8 @@ InitializationSequence::InitializationSequence( Sema &S, const InitializedEntity &Entity, const InitializationKind &Kind, MultiExprArg Args, bool TopLevelOfInitList, bool TreatUnavailableAsInvalid) : FailedOverloadResult(OR_Success), - FailedCandidateSet(Kind.getLocation(), OverloadCandidateSet::CSK_Normal) { + FailedCandidateSet(new OverloadCandidateSet( + Kind.getLocation(), OverloadCandidateSet::CSK_Normal)) { InitializeFrom(S, Entity, Kind, Args, TopLevelOfInitList, TreatUnavailableAsInvalid); } @@ -9735,7 +9736,7 @@ bool InitializationSequence::Diagnose(Sema &S, switch (FailedOverloadResult) { case OR_Ambiguous: - FailedCandidateSet.NoteCandidates( + FailedCandidateSet->NoteCandidates( PartialDiagnosticAt( Kind.getLocation(), Failure == FK_UserConversionOverloadFailed @@ -9749,7 +9750,8 @@ bool InitializationSequence::Diagnose(Sema &S, break; case OR_No_Viable_Function: { - auto Cands = FailedCandidateSet.CompleteCandidates(S, OCD_AllCandidates, Args); + auto Cands = + FailedCandidateSet->CompleteCandidates(S, OCD_AllCandidates, Args); if (!S.RequireCompleteType(Kind.getLocation(), DestType.getNonReferenceType(), diag::err_typecheck_nonviable_condition_incomplete, @@ -9759,13 +9761,13 @@ bool InitializationSequence::Diagnose(Sema &S, << OnlyArg->getType() << Args[0]->getSourceRange() << DestType.getNonReferenceType(); - FailedCandidateSet.NoteCandidates(S, Args, Cands); + FailedCandidateSet->NoteCandidates(S, Args, Cands); break; } case OR_Deleted: { OverloadCandidateSet::iterator Best; - OverloadingResult Ovl - = FailedCandidateSet.BestViableFunction(S, Kind.getLocation(), Best); + OverloadingResult Ovl = + FailedCandidateSet->BestViableFunction(S, Kind.getLocation(), Best); StringLiteral *Msg = Best->Function->getDeletedMessage(); S.Diag(Kind.getLocation(), diag::err_typecheck_deleted_function) @@ -9949,7 +9951,7 @@ bool InitializationSequence::Diagnose(Sema &S, // bad. 
switch (FailedOverloadResult) { case OR_Ambiguous: - FailedCandidateSet.NoteCandidates( + FailedCandidateSet->NoteCandidates( PartialDiagnosticAt(Kind.getLocation(), S.PDiag(diag::err_ovl_ambiguous_init) << DestType << ArgsRange), @@ -10003,7 +10005,7 @@ bool InitializationSequence::Diagnose(Sema &S, break; } - FailedCandidateSet.NoteCandidates( + FailedCandidateSet->NoteCandidates( PartialDiagnosticAt( Kind.getLocation(), S.PDiag(diag::err_ovl_no_viable_function_in_init) @@ -10013,8 +10015,8 @@ bool InitializationSequence::Diagnose(Sema &S, case OR_Deleted: { OverloadCandidateSet::iterator Best; - OverloadingResult Ovl - = FailedCandidateSet.BestViableFunction(S, Kind.getLocation(), Best); + OverloadingResult Ovl = + FailedCandidateSet->BestViableFunction(S, Kind.getLocation(), Best); if (Ovl != OR_Deleted) { S.Diag(Kind.getLocation(), diag::err_ovl_deleted_init) << DestType << ArgsRange; @@ -10093,8 +10095,8 @@ bool InitializationSequence::Diagnose(Sema &S, S.Diag(Kind.getLocation(), diag::err_selected_explicit_constructor) << Args[0]->getSourceRange(); OverloadCandidateSet::iterator Best; - OverloadingResult Ovl - = FailedCandidateSet.BestViableFunction(S, Kind.getLocation(), Best); + OverloadingResult Ovl = + FailedCandidateSet->BestViableFunction(S, Kind.getLocation(), Best); (void)Ovl; assert(Ovl == OR_Success && "Inconsistent overload resolution"); CXXConstructorDecl *CtorDecl = cast(Best->Function); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 227ef564ba3e08..bcde0d86cf10fd 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1057,8 +1057,7 @@ bool OverloadCandidateSet::OperatorRewriteInfo::shouldAddReversed( void OverloadCandidateSet::destroyCandidates() { for (iterator i = begin(), e = end(); i != e; ++i) { - for (auto &C : i->Conversions) - C.~ImplicitConversionSequence(); + delete[] i->Conversions.data(); if (!i->Viable && i->FailureKind == ovl_fail_bad_deduction) i->DeductionFailure.Destroy(); } @@ -1066,8 +1065,6 @@ void OverloadCandidateSet::destroyCandidates() { void OverloadCandidateSet::clear(CandidateSetKind CSK) { destroyCandidates(); - SlabAllocator.Reset(); - NumInlineBytesUsed = 0; Candidates.clear(); Functions.clear(); Kind = CSK; @@ -6983,7 +6980,7 @@ void Sema::AddOverloadCandidate( Candidate.RewriteKind = CandidateSet.getRewriteInfo().getRewriteKind(Function, PO); Candidate.IsSurrogate = false; - Candidate.IsADLCandidate = IsADLCandidate; + Candidate.IsADLCandidate = static_cast(IsADLCandidate); Candidate.IgnoreObjectArgument = false; Candidate.ExplicitCallArguments = Args.size(); @@ -7815,7 +7812,7 @@ void Sema::AddTemplateOverloadCandidate( Candidate.RewriteKind = CandidateSet.getRewriteInfo().getRewriteKind(Candidate.Function, PO); Candidate.IsSurrogate = false; - Candidate.IsADLCandidate = IsADLCandidate; + Candidate.IsADLCandidate = static_cast(IsADLCandidate); // Ignore the object argument if there is one, since we don't have an object // type. 
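For reference, the two idioms side by side (a sketch; the macro is the one
each test now defines for C++98 mode):

  enum E { e = 5 };

  // Old style: encode the check in an array bound. When the operand is
  // not a constant expression, Clang recovers by forming a VLA, which
  // drags VLA extension diagnostics into the expected output.
  int check[(e + 1 == 6) ? 1 : -1];

  // New style: route C++98 through the C11 builtin instead.
  #if __cplusplus == 199711L
  #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__)
  #endif
  static_assert(e + 1 == 6, "");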
Candidate.IgnoreObjectArgument =
@@ -14125,7 +14122,8 @@ static ExprResult FinishOverloadedCallExpr(Sema &SemaRef, Scope *S, Expr *Fn,
       return ExprError();
     return SemaRef.BuildResolvedCallExpr(
         Res.get(), FDecl, LParenLoc, Args, RParenLoc, ExecConfig,
-        /*IsExecConfig=*/false, (*Best)->IsADLCandidate);
+        /*IsExecConfig=*/false,
+        static_cast<CallExpr::ADLCallKind>((*Best)->IsADLCandidate));
   }
 
   case OR_No_Viable_Function: {
@@ -14184,7 +14182,8 @@ static ExprResult FinishOverloadedCallExpr(Sema &SemaRef, Scope *S, Expr *Fn,
       return ExprError();
     return SemaRef.BuildResolvedCallExpr(
         Res.get(), FDecl, LParenLoc, Args, RParenLoc, ExecConfig,
-        /*IsExecConfig=*/false, (*Best)->IsADLCandidate);
+        /*IsExecConfig=*/false,
+        static_cast<CallExpr::ADLCallKind>((*Best)->IsADLCandidate));
   }
   }
 
@@ -14491,7 +14490,8 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc,
     Args[0] = Input;
     CallExpr *TheCall = CXXOperatorCallExpr::Create(
         Context, Op, FnExpr.get(), ArgsArray, ResultTy, VK, OpLoc,
-        CurFPFeatureOverrides(), Best->IsADLCandidate);
+        CurFPFeatureOverrides(),
+        static_cast<CallExpr::ADLCallKind>(Best->IsADLCandidate));
 
     if (CheckCallReturnType(FnDecl->getReturnType(), OpLoc, TheCall, FnDecl))
       return ExprError();
@@ -14909,7 +14909,8 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
         // members; CodeGen should take care not to emit the this pointer.
         TheCall = CXXOperatorCallExpr::Create(
             Context, ChosenOp, FnExpr.get(), Args, ResultTy, VK, OpLoc,
-            CurFPFeatureOverrides(), Best->IsADLCandidate);
+            CurFPFeatureOverrides(),
+            static_cast<CallExpr::ADLCallKind>(Best->IsADLCandidate));
 
         if (const auto *Method = dyn_cast<CXXMethodDecl>(FnDecl);
             Method && Method->isImplicitObjectMemberFunction()) {

From aefff774a0d6f75565243263555f2513ac3c9fdf Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov
Date: Tue, 16 Apr 2024 21:50:22 +0400
Subject: [PATCH 136/300] [clang] Migrate DR tests to `static_assert` (#88611)

This patch touches a number of tests that run in C++98 mode and have been
using array size as a context that requires a constant expression; it
replaces that idiom with a `static_assert` backported via a macro. This
reduces noise in expected directives that comes from diagnostics around
VLAs.

This patch also showcases that DR tests would benefit from folding in
constant expressions in C++98 mode, but I'm not sure it's even on the
table. If it is, I'd be happy to prepare a PR for that, and rebase this
PR on top of it.

CC @AaronBallman
---
 clang/test/CXX/drs/dr0xx.cpp  | 14 +++++---
 clang/test/CXX/drs/dr16xx.cpp |  5 ++-
 clang/test/CXX/drs/dr1xx.cpp  | 61 +++++++++++++++++++----------------
 clang/test/CXX/drs/dr2xx.cpp  | 15 ++++++---
 clang/test/CXX/drs/dr3xx.cpp  | 38 ++++++++++++----------
 clang/test/CXX/drs/dr4xx.cpp  | 60 +++++++++++++++++-----------------
 clang/test/CXX/drs/dr5xx.cpp  | 13 +++++---
 clang/test/CXX/drs/dr6xx.cpp  |  6 ++--
 8 files changed, 117 insertions(+), 95 deletions(-)

diff --git a/clang/test/CXX/drs/dr0xx.cpp b/clang/test/CXX/drs/dr0xx.cpp
index a304862885c640..6c600bbc7c3f6e 100644
--- a/clang/test/CXX/drs/dr0xx.cpp
+++ b/clang/test/CXX/drs/dr0xx.cpp
@@ -5,6 +5,11 @@
 // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple
 // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple
 
+#if __cplusplus == 199711L
+#define static_assert(...)
__extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + namespace cwg1 { // cwg1: no namespace X { extern "C" void cwg1_f(int a = 1); } namespace Y { extern "C" void cwg1_f(int a = 1); } @@ -897,7 +902,7 @@ namespace cwg54 { // cwg54: 2.8 namespace cwg55 { // cwg55: yes enum E { e = 5 }; - int test[(e + 1 == 6) ? 1 : -1]; + static_assert(e + 1 == 6, ""); } namespace cwg56 { // cwg56: yes @@ -1163,10 +1168,9 @@ namespace cwg75 { // cwg75: yes namespace cwg76 { // cwg76: yes const volatile int n = 1; - int arr[n]; // #cwg76-vla - // expected-error@#cwg76-vla {{variable length arrays in C++ are a Clang extension}} - // expected-note@#cwg76-vla {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}} - // expected-error@#cwg76-vla {{variable length array declaration not allowed at file scope}} + static_assert(n, ""); + // expected-error@-1 {{static assertion expression is not an integral constant expression}} + // expected-note@-2 {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}} } namespace cwg77 { // cwg77: yes diff --git a/clang/test/CXX/drs/dr16xx.cpp b/clang/test/CXX/drs/dr16xx.cpp index 6d7bb7619f8b8b..cf6b45ceabf2cc 100644 --- a/clang/test/CXX/drs/dr16xx.cpp +++ b/clang/test/CXX/drs/dr16xx.cpp @@ -153,10 +153,9 @@ namespace cwg1645 { // cwg1645: 3.9 namespace cwg1652 { // cwg1652: 3.6 int a, b; - int arr[&a + 1 == &b ? 1 : 2]; - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} + static_assert(&a + 1 == &b, ""); + // expected-error@-1 {{static assertion expression is not an integral constant expression}} // expected-note@-2 {{comparison against pointer '&a + 1' that points past the end of a complete object has unspecified value}} - // expected-error@-3 {{variable length array declaration not allowed at file scope}} } namespace cwg1653 { // cwg1653: 4 c++17 diff --git a/clang/test/CXX/drs/dr1xx.cpp b/clang/test/CXX/drs/dr1xx.cpp index 5b497dda047d6a..a8f9b705a98660 100644 --- a/clang/test/CXX/drs/dr1xx.cpp +++ b/clang/test/CXX/drs/dr1xx.cpp @@ -5,6 +5,17 @@ // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + +#if __cplusplus == 199711L +#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x)) +#else +#define __enable_constant_folding +#endif + namespace cwg100 { // cwg100: yes template struct A {}; // #cwg100-A template struct B {}; // #cwg100-B @@ -736,8 +747,8 @@ namespace cwg147 { // cwg147: yes namespace cwg148 { // cwg148: yes struct A { int A::*p; }; - int check1[__is_pod(int(A::*)) ? 1 : -1]; - int check2[__is_pod(A) ? 1 : -1]; + static_assert(__is_pod(int(A::*)), ""); + static_assert(__is_pod(A), ""); } // cwg149: na @@ -745,13 +756,7 @@ namespace cwg148 { // cwg148: yes namespace cwg151 { // cwg151: 3.1 struct X {}; typedef int X::*p; -#if __cplusplus < 201103L -#define fold(x) (__builtin_constant_p(0) ? (x) : (x)) -#else -#define fold -#endif - int check[fold(p() == 0) ? 
1 : -1]; -#undef fold + static_assert(__enable_constant_folding(p() == 0), ""); } namespace cwg152 { // cwg152: yes @@ -956,42 +961,42 @@ namespace cwg171 { namespace cwg172 { // cwg172: yes enum { zero }; - int check1[-1 < zero ? 1 : -1]; + static_assert(-1 < zero, ""); enum { x = -1, y = (unsigned int)-1 }; - int check2[sizeof(x) > sizeof(int) ? 1 : -1]; + static_assert(sizeof(x) > sizeof(int), ""); enum { a = (unsigned int)-1 / 2 }; - int check3a[sizeof(a) == sizeof(int) ? 1 : -1]; - int check3b[-a < 0 ? 1 : -1]; + static_assert(sizeof(a) == sizeof(int), ""); + static_assert(-a < 0, ""); enum { b = (unsigned int)-1 / 2 + 1 }; - int check4a[sizeof(b) == sizeof(unsigned int) ? 1 : -1]; - int check4b[-b > 0 ? 1 : -1]; + static_assert(sizeof(b) == sizeof(unsigned int), ""); + static_assert(-b > 0, ""); enum { c = (unsigned long)-1 / 2 }; - int check5a[sizeof(c) == sizeof(long) ? 1 : -1]; - int check5b[-c < 0 ? 1 : -1]; + static_assert(sizeof(c) == sizeof(long), ""); + static_assert(-c < 0, ""); enum { d = (unsigned long)-1 / 2 + 1 }; - int check6a[sizeof(d) == sizeof(unsigned long) ? 1 : -1]; - int check6b[-d > 0 ? 1 : -1]; + static_assert(sizeof(d) == sizeof(unsigned long), ""); + static_assert(-d > 0, ""); enum { e = (unsigned long long)-1 / 2 }; // cxx98-error@-1 {{'long long' is a C++11 extension}} - int check7a[sizeof(e) == sizeof(long) ? 1 : -1]; - int check7b[-e < 0 ? 1 : -1]; + static_assert(sizeof(e) == sizeof(long), ""); + static_assert(-e < 0, ""); enum { f = (unsigned long long)-1 / 2 + 1 }; // cxx98-error@-1 {{'long long' is a C++11 extension}} - int check8a[sizeof(f) == sizeof(unsigned long) ? 1 : -1]; - int check8b[-f > 0 ? 1 : -1]; + static_assert(sizeof(f) == sizeof(unsigned long), ""); + static_assert(-f > 0, ""); } namespace cwg173 { // cwg173: yes - int check[('0' + 1 == '1' && '0' + 2 == '2' && '0' + 3 == '3' && - '0' + 4 == '4' && '0' + 5 == '5' && '0' + 6 == '6' && - '0' + 7 == '7' && '0' + 8 == '8' && '0' + 9 == '9') ? 1 : -1]; + static_assert('0' + 1 == '1' && '0' + 2 == '2' && '0' + 3 == '3' && + '0' + 4 == '4' && '0' + 5 == '5' && '0' + 6 == '6' && + '0' + 7 == '7' && '0' + 8 == '8' && '0' + 9 == '9', ""); } // cwg174: sup 1012 @@ -1070,7 +1075,7 @@ namespace cwg177 { // cwg177: yes } namespace cwg178 { // cwg178: yes - int check[int() == 0 ? 1 : -1]; + static_assert(int() == 0, ""); #if __cplusplus >= 201103L static_assert(int{} == 0, ""); struct S { int a, b; }; @@ -1180,7 +1185,7 @@ namespace cwg187 { // cwg187: sup 481 namespace cwg188 { // cwg188: yes char c[10]; - int check[sizeof(0, c) == 10 ? 1 : -1]; + static_assert(sizeof(0, c) == 10, ""); } // cwg190 FIXME: add codegen test for tbaa diff --git a/clang/test/CXX/drs/dr2xx.cpp b/clang/test/CXX/drs/dr2xx.cpp index e655e7226d51d6..5d3e8ce4bea3bc 100644 --- a/clang/test/CXX/drs/dr2xx.cpp +++ b/clang/test/CXX/drs/dr2xx.cpp @@ -10,10 +10,15 @@ typedef __SIZE_TYPE__ size_t; // cxx98-error@-1 0-1 {{'long long' is a C++11 extension}} -#if __cplusplus < 201103L -#define fold(x) (__builtin_constant_p(x) ? (x) : (x)) +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + +#if __cplusplus == 199711L +#define __enable_constant_folding(x) (__builtin_constant_p(x) ? 
(x) : (x)) #else -#define fold +#define __enable_constant_folding #endif namespace cwg200 { // cwg200: dup 214 @@ -31,7 +36,7 @@ namespace cwg200 { // cwg200: dup 214 namespace cwg202 { // cwg202: 3.1 template T f(); template struct X { - int arr[fold(g == &f) ? 1 : -1]; + static_assert(__enable_constant_folding(g == &f), ""); }; template struct X; } @@ -1024,7 +1029,7 @@ namespace cwg275 { // cwg275: no namespace cwg277 { // cwg277: 3.1 typedef int *intp; int *p = intp(); - int a[fold(intp() ? -1 : 1)]; + static_assert(__enable_constant_folding(!intp()), ""); } namespace cwg280 { // cwg280: 2.9 diff --git a/clang/test/CXX/drs/dr3xx.cpp b/clang/test/CXX/drs/dr3xx.cpp index 6d1c6958ac8eb6..3e9228fe21fb64 100644 --- a/clang/test/CXX/drs/dr3xx.cpp +++ b/clang/test/CXX/drs/dr3xx.cpp @@ -5,6 +5,17 @@ // RUN: %clang_cc1 -std=c++11 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx11-14,since-cxx11 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++98 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx98 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + +#if __cplusplus == 199711L +#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x)) +#else +#define __enable_constant_folding +#endif + namespace cwg300 { // cwg300: yes template void f(R (&)(A)) {} int g(int); @@ -396,7 +407,7 @@ namespace cwg324 { // cwg324: 3.6 namespace cwg326 { // cwg326: 3.1 struct S {}; - int test[__is_trivially_constructible(S, const S&) ? 1 : -1]; + static_assert(__is_trivially_constructible(S, const S&), ""); } namespace cwg327 { // cwg327: dup 538 @@ -653,7 +664,7 @@ namespace cwg339 { // cwg339: 2.8 template A make_A(); - int a[conv_int::value ? 1 : -1]; + static_assert(conv_int::value, ""); bool b = conv_int2(A<1>()); A<1> c = make_A(); } @@ -1099,21 +1110,14 @@ namespace cwg364 { // cwg364: yes #endif namespace cwg367 { // cwg367: yes - // FIXME: These diagnostics are terrible. Don't diagnose an ill-formed global - // array as being a VLA! - int a[true ? throw 0 : 4]; - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} - // expected-error@-2 {{variable length array declaration not allowed at file scope}} - int b[true ? 4 : throw 0]; - // cxx98-error@-1 {{variable length arrays in C++ are a Clang extension}} - // cxx98-error@-2 {{variable length array folded to constant array as an extension}} - int c[true ? *new int : 4]; - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} + static_assert(__enable_constant_folding(true ? throw 0 : 4), ""); + // expected-error@-1 {{expression is not an integral constant expression}} + static_assert(__enable_constant_folding(true ? 4 : throw 0), ""); + static_assert(__enable_constant_folding(true ? *new int : 4), ""); + // expected-error@-1 {{expression is not an integral constant expression}} // expected-note@-2 {{read of uninitialized object is not allowed in a constant expression}} - // expected-error@-3 {{variable length array declaration not allowed at file scope}} - int d[true ? 4 : *new int]; - // cxx98-error@-1 {{variable length arrays in C++ are a Clang extension}} - // cxx98-error@-2 {{variable length array folded to constant array as an extension}} + static_assert(__enable_constant_folding(true ? 
4 : *new int), ""); + } namespace cwg368 { // cwg368: 3.6 @@ -1325,7 +1329,7 @@ namespace cwg383 { // cwg383: yes struct B { ~B(); }; union C { C &operator=(const C&); }; union D { ~D(); }; - int check[(__is_pod(A) || __is_pod(B) || __is_pod(C) || __is_pod(D)) ? -1 : 1]; + static_assert(!__is_pod(A) && !__is_pod(B) && !__is_pod(C) && !__is_pod(D), ""); } namespace cwg384 { // cwg384: yes diff --git a/clang/test/CXX/drs/dr4xx.cpp b/clang/test/CXX/drs/dr4xx.cpp index 611b791470785d..07162cc28f6b60 100644 --- a/clang/test/CXX/drs/dr4xx.cpp +++ b/clang/test/CXX/drs/dr4xx.cpp @@ -6,6 +6,11 @@ // RUN: env ASAN_OPTIONS=detect_stack_use_after_return=0 %clang_cc1 -std=c++23 %s -verify=expected,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: env ASAN_OPTIONS=detect_stack_use_after_return=0 %clang_cc1 -std=c++2c %s -verify=expected,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + // FIXME: __SIZE_TYPE__ expands to 'long long' on some targets. __extension__ typedef __SIZE_TYPE__ size_t; @@ -217,7 +222,7 @@ namespace cwg407 { // cwg407: 3.8 } namespace cwg408 { // cwg408: 3.4 - template void g() { int arr[N != 1 ? 1 : -1]; } + template void g() { static_assert(N != 1, ""); } template<> void g<2>() { } template struct S { @@ -239,7 +244,7 @@ namespace cwg408 { // cwg408: 3.4 }; template int R::arr[1]; template void R::f() { - int arr[sizeof(arr) != sizeof(int) ? 1 : -1]; + static_assert(sizeof(arr) != sizeof(int), ""); } template<> int R::arr[2]; template void R::f(); @@ -842,11 +847,10 @@ namespace cwg451 { // cwg451: yes // expected-warning@-1 {{division by zero is undefined}} const int b = 1 / 0; // #cwg451-b // expected-warning@-1 {{division by zero is undefined}} - int arr[b]; // #cwg451-arr - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} + static_assert(b, ""); + // expected-error@-1 {{expression is not an integral constant expression}} // expected-note@-2 {{initializer of 'b' is not a constant expression}} // expected-note@#cwg451-b {{declared here}} - // expected-error@#cwg451-arr {{variable length array declaration not allowed at file scope}} } namespace cwg452 { // cwg452: yes @@ -876,11 +880,10 @@ namespace cwg456 { // cwg456: yes namespace cwg457 { // cwg457: yes const int a = 1; const volatile int b = 1; - int ax[a]; - int bx[b]; - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} + static_assert(a, ""); + static_assert(b, ""); + // expected-error@-1 {{expression is not an integral constant expression}} // expected-note@-2 {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}} - // expected-error@-3 {{variable length array declaration not allowed at file scope}} enum E { ea = a, @@ -1276,20 +1279,18 @@ namespace cwg482 { // cwg482: 3.5 namespace cwg483 { // cwg483: yes namespace climits { - int check1[__SCHAR_MAX__ >= 127 ? 1 : -1]; - int check2[__SHRT_MAX__ >= 32767 ? 1 : -1]; - int check3[__INT_MAX__ >= 32767 ? 1 : -1]; - int check4[__LONG_MAX__ >= 2147483647 ? 1 : -1]; - int check5[__LONG_LONG_MAX__ >= 9223372036854775807 ? 
1 : -1]; - // cxx98-error@-1 {{'long long' is a C++11 extension}} - // cxx98-error@-2 0-1{{'long long' is a C++11 extension}} + static_assert(__SCHAR_MAX__ >= 127, ""); + static_assert(__SHRT_MAX__ >= 32767, ""); + static_assert(__INT_MAX__ >= 32767, ""); + static_assert(__LONG_MAX__ >= 2147483647, ""); + static_assert(__LONG_LONG_MAX__ >= 9223372036854775807, ""); } namespace cstdint { - int check1[__PTRDIFF_WIDTH__ >= 16 ? 1 : -1]; - int check2[__SIG_ATOMIC_WIDTH__ >= 8 ? 1 : -1]; - int check3[__SIZE_WIDTH__ >= 16 ? 1 : -1]; - int check4[__WCHAR_WIDTH__ >= 8 ? 1 : -1]; - int check5[__WINT_WIDTH__ >= 16 ? 1 : -1]; + static_assert(__PTRDIFF_WIDTH__ >= 16, ""); + static_assert(__SIG_ATOMIC_WIDTH__ >= 8, ""); + static_assert(__SIZE_WIDTH__ >= 16, ""); + static_assert(__WCHAR_WIDTH__ >= 8, ""); + static_assert(__WINT_WIDTH__ >= 16, ""); } } @@ -1366,11 +1367,10 @@ namespace cwg486 { // cwg486: yes namespace cwg487 { // cwg487: yes enum E { e }; int operator+(int, E); // #cwg487-operator-plus - int i[4 + e]; // #cwg487-i - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} + static_assert(4 + e, ""); + // expected-error@-1 {{expression is not an integral constant expression}} // since-cxx11-note@-2 {{non-constexpr function 'operator+' cannot be used in a constant expression}} // since-cxx11-note@#cwg487-operator-plus {{declared here}} - // expected-error@#cwg487-i {{variable length array declaration not allowed at file scope}} } namespace cwg488 { // cwg488: yes c++11 @@ -1485,13 +1485,13 @@ namespace cwg495 { // cwg495: 3.5 namespace cwg496 { // cwg496: sup 2094 struct A { int n; }; struct B { volatile int n; }; - int check1[ __is_trivially_copyable(const int) ? 1 : -1]; + static_assert(__is_trivially_copyable(const int), ""); // This checks the cwg2094 behavior, not cwg496 - int check2[ __is_trivially_copyable(volatile int) ? 1 : -1]; - int check3[ __is_trivially_constructible(A, const A&) ? 1 : -1]; - int check4[ __is_trivially_constructible(B, const B&) ? 1 : -1]; - int check5[ __is_trivially_assignable(A, const A&) ? 1 : -1]; - int check6[ __is_trivially_assignable(B, const B&) ? 1 : -1]; + static_assert(__is_trivially_copyable(volatile int), ""); + static_assert(__is_trivially_constructible(A, const A&), ""); + static_assert(__is_trivially_constructible(B, const B&), ""); + static_assert(__is_trivially_assignable(A, const A&), ""); + static_assert(__is_trivially_assignable(B, const B&), ""); } namespace cwg497 { // cwg497: sup 253 diff --git a/clang/test/CXX/drs/dr5xx.cpp b/clang/test/CXX/drs/dr5xx.cpp index 0fe64102d70b00..9d890f981348a7 100644 --- a/clang/test/CXX/drs/dr5xx.cpp +++ b/clang/test/CXX/drs/dr5xx.cpp @@ -5,6 +5,11 @@ // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx23,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + // FIXME: This is included to avoid a diagnostic with no source location // pointing at the implicit operator new. We can't match such a diagnostic // with -verify. @@ -819,7 +824,7 @@ namespace cwg565 { // cwg565: yes namespace cwg566 { // cwg566: yes #if __cplusplus >= 201103L - int check[int(-3.99) == -3 ? 
1 : -1]; + static_assert(int(-3.99) == -3, ""); #endif } @@ -834,7 +839,7 @@ namespace cwg568 { // cwg568: 3.0 c++11 public: int n; }; - int check_trivial[__is_trivial(trivial) ? 1 : -1]; + static_assert(__is_trivial(trivial), ""); struct std_layout { std_layout(); @@ -843,7 +848,7 @@ namespace cwg568 { // cwg568: 3.0 c++11 private: int n; }; - int check_std_layout[__is_standard_layout(std_layout) ? 1 : -1]; + static_assert(__is_standard_layout(std_layout), ""); struct aggregate { int x; @@ -885,7 +890,7 @@ namespace cwg570 { // cwg570: dup 633 namespace cwg572 { // cwg572: yes enum E { a = 1, b = 2 }; - int check[a + b == 3 ? 1 : -1]; + static_assert(a + b == 3, ""); } namespace cwg573 { // cwg573: no diff --git a/clang/test/CXX/drs/dr6xx.cpp b/clang/test/CXX/drs/dr6xx.cpp index 9d3613ae8589ea..069102d9c59750 100644 --- a/clang/test/CXX/drs/dr6xx.cpp +++ b/clang/test/CXX/drs/dr6xx.cpp @@ -144,7 +144,7 @@ namespace cwg608 { // cwg608: yes struct D : B, C {}; } -int cwg610[-0u == 0u ? 1 : -1]; // cwg610: yes +static_assert(-0u == 0u, ""); // cwg610: yes namespace cwg611 { // cwg611: yes int k; @@ -190,8 +190,8 @@ namespace cwg613 { // cwg613: yes c++11 } } -int cwg614_a[(-1) / 2 == 0 ? 1 : -1]; // cwg614: yes -int cwg614_b[(-1) % 2 == -1 ? 1 : -1]; +static_assert((-1) / 2 == 0, ""); // cwg614: yes +static_assert((-1) % 2 == -1, ""); namespace cwg615 { // cwg615: yes int f(); From 6b83fe552990966fdad0e5693a79b02b87d9526e Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 16 Apr 2024 11:03:53 -0700 Subject: [PATCH 137/300] [RISCV] Strength reduce mul by 2^n + 2/4/8 + 1 (#88911) With zba, we can expand this to (add (shl X, C1), (shXadd X, X)). Note that this is our first expansion to a three instruction sequence. I believe this to general be a reasonable tradeoff for most architectures, but we may want to (someday) consider a tuning flag here. I plan to support 2^n + (2/4/8 + 1) eventually as well, but that comes behind 2^N - 2^M. Both are also three instruction sequences. --------- Co-authored-by: Min-Yih Hsu --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 37 +++++++++++++++ llvm/test/CodeGen/RISCV/rv64zba.ll | 51 +++++++++++++++------ 2 files changed, 73 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index de2ad639f0d6c8..dc7c6f83b98579 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13437,6 +13437,43 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2); } } + + // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x) + // Matched in tablegen, avoid perturbing patterns. 
+ switch (MulAmt) { + case 11: + case 13: + case 19: + case 21: + case 25: + case 27: + case 29: + case 37: + case 41: + case 45: + case 73: + case 91: + return SDValue(); + default: + break; + } + + // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X)) + if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) { + unsigned ScaleShift = llvm::countr_zero(MulAmt - 1); + if (ScaleShift >= 1 && ScaleShift < 4) { + unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2))); + SDLoc DL(N); + SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(ShiftAmt, DL, VT)); + SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(ScaleShift, DL, VT)); + return DAG.getNode( + ISD::ADD, DL, VT, Shift1, + DAG.getNode(ISD::ADD, DL, VT, Shift2, N->getOperand(0))); + } + } + return SDValue(); } diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index c3c757656be933..b4c80b60e0bad5 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -598,31 +598,52 @@ define i64 @mul125(i64 %a) { } define i64 @mul131(i64 %a) { -; CHECK-LABEL: mul131: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 131 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul131: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 131 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul131: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh1add a1, a0, a0 +; RV64ZBA-NEXT: slli a0, a0, 7 +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 131 ret i64 %c } define i64 @mul133(i64 %a) { -; CHECK-LABEL: mul133: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 133 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul133: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 133 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul133: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh2add a1, a0, a0 +; RV64ZBA-NEXT: slli a0, a0, 7 +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 133 ret i64 %c } define i64 @mul137(i64 %a) { -; CHECK-LABEL: mul137: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 137 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul137: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 137 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul137: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh3add a1, a0, a0 +; RV64ZBA-NEXT: slli a0, a0, 7 +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 137 ret i64 %c } From 1c2afbae9af22b58190c10e3517242d01d89d612 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 16 Apr 2024 11:05:45 -0700 Subject: [PATCH 138/300] [CodeGen,test] Test llvm-libc style alias attribute with UsingShadowDecl The pattern is quite involved and deserves a specific codegen test. 
This test would catch the bug in the first attempt of #87130 --- clang/test/CodeGen/alias.cpp | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/clang/test/CodeGen/alias.cpp b/clang/test/CodeGen/alias.cpp index 17c1e1ae32f035..a468c31d369ed0 100644 --- a/clang/test/CodeGen/alias.cpp +++ b/clang/test/CodeGen/alias.cpp @@ -1,27 +1,42 @@ -// RUN: %clang_cc1 -triple x86_64-linux -verify -emit-llvm-only %s -// RUN: not %clang_cc1 -triple x86_64-linux -emit-llvm-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -verify -emit-llvm-only -DERR %s +// RUN: not %clang_cc1 -triple x86_64-linux -emit-llvm-only -fdiagnostics-parseable-fixits -DERR %s 2>&1 | FileCheck %s --check-prefix=FIXIT +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm %s -o - | FileCheck %s +#ifdef ERR void *f1_ifunc(void) { return nullptr; } void f1(void) __attribute__((alias("f1_ifunc"))); // expected-error@-1 {{alias must point to a defined variable or function}} // expected-note@-2 {{must refer to its mangled name}} // expected-note@-3 {{function by that name is mangled as}} -// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:30-[[@LINE-4]]:47}:"alias(\"_Z8f1_ifuncv\")" +// FIXIT: fix-it:"{{.*}}":{[[@LINE-4]]:30-[[@LINE-4]]:47}:"alias(\"_Z8f1_ifuncv\")" void *f6_resolver_resolver(void) { return 0; } void *f6_resolver(void) __attribute__((alias("f6_resolver_resolver"))); // expected-error@-1 {{alias must point to a defined variable or function}} // expected-note@-2 {{must refer to its mangled name}} // expected-note@-3 {{function by that name is mangled as}} -// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:40-[[@LINE-4]]:69}:"alias(\"_Z20f6_resolver_resolverv\")" +// FIXIT: fix-it:"{{.*}}":{[[@LINE-4]]:40-[[@LINE-4]]:69}:"alias(\"_Z20f6_resolver_resolverv\")" void f6(void) __attribute__((alias("f6_resolver"))); // expected-error@-1 {{alias must point to a defined variable or function}} // expected-note@-2 {{must refer to its mangled name}} // expected-note@-3 {{function by that name is mangled as}} -// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:30-[[@LINE-4]]:50}:"alias(\"_Z11f6_resolverv\")" +// FIXIT: fix-it:"{{.*}}":{[[@LINE-4]]:30-[[@LINE-4]]:50}:"alias(\"_Z11f6_resolverv\")" __attribute__((unused, alias("resolver"), deprecated("hahahaha, isn't C great?"))) void func(); // expected-error@-2 {{alias must point to a defined variable or function}} // expected-note@-3 {{must refer to its mangled name}} +#endif +// CHECK: @_ZN4libc4log2Ed ={{.*}} alias double (double), ptr @log2 +// CHECK: define{{.*}} @log2( +namespace libc { double log2(double x); } +extern "C" double log2(double); +namespace std { using ::log2; } +using std::log2; + +namespace libc { +decltype(libc::log2) __log2_impl__ __asm__("log2"); +decltype(libc::log2) log2 [[gnu::alias("log2")]]; +double __log2_impl__(double x) { return x; } +} From 5462b27026dee886fb896980d6ad9487200a6cbe Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Tue, 16 Apr 2024 20:07:03 +0200 Subject: [PATCH 139/300] [NFC][libc++][TZDB] Refactors argument order. (#85781) Putting the output reference argument first looks more sensible. --- libcxx/include/__chrono/formatter.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h index 4ad59382a4148a..b64cae529a294d 100644 --- a/libcxx/include/__chrono/formatter.h +++ b/libcxx/include/__chrono/formatter.h @@ -79,7 +79,7 @@ namespace __formatter { // small). 
Therefore a duration uses its own conversion.
 template <class _CharT, class _Rep, class _Period>
 _LIBCPP_HIDE_FROM_ABI void
-__format_sub_seconds(const chrono::duration<_Rep, _Period>& __value, basic_stringstream<_CharT>& __sstr) {
+__format_sub_seconds(basic_stringstream<_CharT>& __sstr, const chrono::duration<_Rep, _Period>& __value) {
   __sstr << std::use_facet<std::numpunct<_CharT>>(__sstr.getloc()).decimal_point();
 
   using __duration = chrono::duration<_Rep, _Period>;
@@ -110,13 +110,13 @@ __format_sub_seconds(const chrono::duration<_Rep, _Period>& __value, basic_strin
 }
 
 template <class _CharT, class _Tp>
-_LIBCPP_HIDE_FROM_ABI void __format_sub_seconds(const _Tp& __value, basic_stringstream<_CharT>& __sstr) {
-  __formatter::__format_sub_seconds(__value.time_since_epoch(), __sstr);
+_LIBCPP_HIDE_FROM_ABI void __format_sub_seconds(basic_stringstream<_CharT>& __sstr, const _Tp& __value) {
+  __formatter::__format_sub_seconds(__sstr, __value.time_since_epoch());
 }
 
 template <class _CharT, class _Duration>
 _LIBCPP_HIDE_FROM_ABI void
-__format_sub_seconds(const chrono::hh_mm_ss<_Duration>& __value, basic_stringstream<_CharT>& __sstr) {
+__format_sub_seconds(basic_stringstream<_CharT>& __sstr, const chrono::hh_mm_ss<_Duration>& __value) {
   __sstr << std::use_facet<std::numpunct<_CharT>>(__sstr.getloc()).decimal_point();
   if constexpr (chrono::treat_as_floating_point_v<typename _Duration::rep>)
     std::format_to(std::ostreambuf_iterator<_CharT>{__sstr},
@@ -143,7 +143,7 @@ consteval bool __use_fraction() {
 }
 
 template <class _CharT>
-_LIBCPP_HIDE_FROM_ABI void __format_year(int __year, basic_stringstream<_CharT>& __sstr) {
+_LIBCPP_HIDE_FROM_ABI void __format_year(basic_stringstream<_CharT>& __sstr, int __year) {
   if (__year < 0) {
     __sstr << _CharT('-');
     __year = -__year;
@@ -159,7 +159,7 @@ _LIBCPP_HIDE_FROM_ABI void __format_year(int __year, basic_stringstream<_CharT>&
 }
 
 template <class _CharT>
-_LIBCPP_HIDE_FROM_ABI void __format_century(int __year, basic_stringstream<_CharT>& __sstr) {
+_LIBCPP_HIDE_FROM_ABI void __format_century(basic_stringstream<_CharT>& __sstr, int __year) {
   // TODO FMT Write an issue
   // [tab:time.format.spec]
   //   %C The year divided by 100 using floored division. If the result is a
@@ -172,7 +172,7 @@ _LIBCPP_HIDE_FROM_ABI void __format_century(int __year, basic_stringstream<_Char
 
 template <class _CharT, class _Tp>
 _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs(
-    const _Tp& __value, basic_stringstream<_CharT>& __sstr, basic_string_view<_CharT> __chrono_specs) {
+    basic_stringstream<_CharT>& __sstr, const _Tp& __value, basic_string_view<_CharT> __chrono_specs) {
   tm __t              = std::__convert_to_tm<tm>(__value);
   const auto& __facet = std::use_facet<std::time_put<_CharT>>(__sstr.getloc());
   for (auto __it = __chrono_specs.begin(); __it != __chrono_specs.end(); ++__it) {
@@ -196,7 +196,7 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs(
       // strftime's output is only defined in the range [00, 99].
int __year = __t.tm_year + 1900;
       if (__year < 1000 || __year > 9999)
-        __formatter::__format_century(__year, __sstr);
+        __formatter::__format_century(__sstr, __year);
       else
         __facet.put(
             {__sstr}, __sstr, _CharT(' '), std::addressof(__t), std::to_address(__s), std::to_address(__it + 1));
@@ -242,7 +242,7 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs(
       __facet.put(
           {__sstr}, __sstr, _CharT(' '), std::addressof(__t), std::to_address(__s), std::to_address(__it + 1));
       if constexpr (__use_fraction<_Tp>())
-        __formatter::__format_sub_seconds(__value, __sstr);
+        __formatter::__format_sub_seconds(__sstr, __value);
       break;
 
     // Unlike time_put and strftime the formatting library requires %Y
@@ -283,13 +283,13 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs(
       // Depending on the platform's libc the range of supported years is
       // limited. Instead of testing all conditions use the internal
       // implementation unconditionally.
-      __formatter::__format_year(__t.tm_year + 1900, __sstr);
+      __formatter::__format_year(__sstr, __t.tm_year + 1900);
       break;
 
     case _CharT('F'): {
       int __year = __t.tm_year + 1900;
       if (__year < 1000) {
-        __formatter::__format_year(__year, __sstr);
+        __formatter::__format_year(__sstr, __year);
         __sstr << std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "-{:02}-{:02}"), __t.tm_mon + 1, __t.tm_mday);
       } else
         __facet.put(
@@ -310,7 +310,7 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs(
         ++__it;
         __facet.put(
             {__sstr}, __sstr, _CharT(' '), std::addressof(__t), std::to_address(__s), std::to_address(__it + 1));
-        __formatter::__format_sub_seconds(__value, __sstr);
+        __formatter::__format_sub_seconds(__sstr, __value);
         break;
       }
     }
@@ -512,7 +512,7 @@ __format_chrono(const _Tp& __value,
   if constexpr (chrono::__is_duration<_Tp>::value) {
     if (__value < __value.zero())
       __sstr << _CharT('-');
-    __formatter::__format_chrono_using_chrono_specs(chrono::abs(__value), __sstr, __chrono_specs);
+    __formatter::__format_chrono_using_chrono_specs(__sstr, chrono::abs(__value), __chrono_specs);
     // TODO FMT When keeping the precision it will truncate the string.
     // Note that what the precision does isn't specified.
     __specs.__precision_ = -1;
@@ -556,7 +556,7 @@ __format_chrono(const _Tp& __value,
         __sstr << _CharT('-');
     }
 
-    __formatter::__format_chrono_using_chrono_specs(__value, __sstr, __chrono_specs);
+    __formatter::__format_chrono_using_chrono_specs(__sstr, __value, __chrono_specs);
   }
 }

From a75c9d059791f5d175f6c263d114d59e51b46120 Mon Sep 17 00:00:00 2001
From: Mark de Wever
Date: Tue, 16 Apr 2024 20:18:34 +0200
Subject: [PATCH 140/300] [NFC][libc++] Moves ios_base's forward declaration.
 (#88027)

According to our synopsis it belonged in ios_fwd, but this is not true
in the C++11 version of the Standard (I did not validate against C++98).
Moving it to the forward header for ios, where the Standard declares it,
allows removing a module quirk. An earlier removal of std::vector's
forward declaration allows removing all quirks for the iosfwd module
part. Since iosfwd includes __fwd/ios.h this does not change the
required includes.
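As a rough illustration of the pattern involved (a simplified sketch, not
libc++'s actual sources; all names below are invented for exposition), a
forward-declaration header keeps the cheap declarations separate from the
full definitions, and the public forwarding header simply re-exports them:

```cpp
// fwd_ios.h -- forward declarations only; nothing heavy is pulled in.
namespace mylib {
class ios_base; // defined later, in the full "ios" header
template <class CharT>
class basic_ios; // ditto
} // namespace mylib

// iosfwd.h -- the public forwarding header just includes fwd_ios.h,
// so user code keeps seeing exactly the same declarations as before:
// #include "fwd_ios.h"
```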
--- libcxx/include/__fwd/ios.h | 2 ++ libcxx/include/iosfwd | 3 --- libcxx/utils/libcxx/test/modules.py | 2 -- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/libcxx/include/__fwd/ios.h b/libcxx/include/__fwd/ios.h index 82c865d58cc751..48350709d4ce25 100644 --- a/libcxx/include/__fwd/ios.h +++ b/libcxx/include/__fwd/ios.h @@ -18,6 +18,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD +class _LIBCPP_EXPORTED_FROM_ABI ios_base; + template > class _LIBCPP_TEMPLATE_VIS basic_ios; diff --git a/libcxx/include/iosfwd b/libcxx/include/iosfwd index 9af5e05031850d..2481667dd972cf 100644 --- a/libcxx/include/iosfwd +++ b/libcxx/include/iosfwd @@ -25,7 +25,6 @@ template<> struct char_traits; template class allocator; -class ios_base; template > class basic_ios; template > class basic_streambuf; @@ -124,8 +123,6 @@ using wosyncstream = basic_osyncstream; // C++20 _LIBCPP_BEGIN_NAMESPACE_STD -class _LIBCPP_EXPORTED_FROM_ABI ios_base; - template > class _LIBCPP_TEMPLATE_VIS istreambuf_iterator; template > diff --git a/libcxx/utils/libcxx/test/modules.py b/libcxx/utils/libcxx/test/modules.py index 3f3c7999a1a21d..44c6292ff1140f 100644 --- a/libcxx/utils/libcxx/test/modules.py +++ b/libcxx/utils/libcxx/test/modules.py @@ -26,8 +26,6 @@ # The operators are added for private types like __iom_t10. SkipDeclarations["iomanip"] = ["std::operator<<", "std::operator>>"] -SkipDeclarations["iosfwd"] = ["std::ios_base", "std::vector"] - # This header also provides declarations in the namespace that might be # an error. SkipDeclarations["filesystem"] = [ From 9cd3e92f05fcc2c9168a7abc56d08f0d33bfdfdf Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Tue, 16 Apr 2024 20:19:53 +0200 Subject: [PATCH 141/300] [libc++][modules] Removes some validation quirks. (#88031) Recent unrelated header cleanups caused these quirks to become obsolete. --- libcxx/utils/libcxx/test/modules.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/libcxx/utils/libcxx/test/modules.py b/libcxx/utils/libcxx/test/modules.py index 44c6292ff1140f..aab7651c7bb039 100644 --- a/libcxx/utils/libcxx/test/modules.py +++ b/libcxx/utils/libcxx/test/modules.py @@ -52,8 +52,6 @@ "std::operator==", ] -# Declared in the forward header since std::string uses std::allocator -SkipDeclarations["string"] = ["std::allocator"] # TODO MODULES remove zombie names # https://libcxx.llvm.org/Status/Cxx20.html#note-p0619 SkipDeclarations["memory"] = [ @@ -61,9 +59,6 @@ "std::get_temporary_buffer", ] -# TODO MODULES this should be part of ios instead -SkipDeclarations["streambuf"] = ["std::basic_ios"] - # include/__type_traits/is_swappable.h SkipDeclarations["type_traits"] = [ "std::swap", From 41a830500aa5556a65198607ec751d8e3254c949 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Tue, 16 Apr 2024 20:20:37 +0200 Subject: [PATCH 142/300] [libc++] Removes deprecated _LIBCPP_ENABLE__REMOVED_FEATURES macros (#88548) We marked those macros as deprecated in the last release with the intent of removing them in LLVM 19. This commit performs the removal. 
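For downstream projects that still define the umbrella macro, the migration
is mechanical: define only the individual feature macros that are actually
needed. A hedged sketch (the individual macro names and their effect are the
ones documented in UsingLibcxx.rst below; the snippet itself is illustrative):

```cpp
// Build with: -D_LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR
// instead of the removed umbrella -D_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES.
#include <memory>

int main() {
  std::auto_ptr<int> p(new int(42)); // re-enabled by the individual macro
  return *p == 42 ? 0 : 1;
}
```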
--- libcxx/docs/ReleaseNotes/19.rst | 2 +- libcxx/docs/UsingLibcxx.rst | 12 ------- libcxx/include/__config | 26 -------------- ...le_removed_cpp17_features.compile.pass.cpp | 36 ------------------- ...moved_cpp17_features.deprecated.verify.cpp | 20 ----------- ...moved_cpp20_features.deprecated.verify.cpp | 20 ----------- 6 files changed, 1 insertion(+), 115 deletions(-) delete mode 100644 libcxx/test/libcxx/depr/enable_removed_cpp17_features.compile.pass.cpp delete mode 100644 libcxx/test/libcxx/depr/enable_removed_cpp17_features.deprecated.verify.cpp delete mode 100644 libcxx/test/libcxx/depr/enable_removed_cpp20_features.deprecated.verify.cpp diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index 45aac88e455024..53cc7a77d1af48 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -97,7 +97,7 @@ Deprecations and Removals - The ``_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS`` and ``_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_VOID_SPECIALIZATION`` macros have been removed in LLVM 19. -- TODO: The ``_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES`` and ``_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES`` macros have +- The ``_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES`` and ``_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES`` macros have been removed in LLVM 19. C++17 and C++20 removed features can still be re-enabled individually. - The ``_LIBCPP_INLINE_VISIBILITY`` and ``_VSTD`` macros have been removed in LLVM 19. diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst index c0e85ad4d5e247..8f945656de1ca6 100644 --- a/libcxx/docs/UsingLibcxx.rst +++ b/libcxx/docs/UsingLibcxx.rst @@ -208,12 +208,6 @@ safety annotations. C++17 Specific Configuration Macros ----------------------------------- -**_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES**: - This macro is used to re-enable all the features removed in C++17. The effect - is equivalent to manually defining each macro listed below. - This macro is deprecated and will be removed in LLVM-19. Use the - individual macros listed below. - **_LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR**: This macro is used to re-enable `auto_ptr`. @@ -238,12 +232,6 @@ C++20 Specific Configuration Macros This macro is used to re-enable the function ``std::shared_ptr<...>::unique()``. -**_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES**: - This macro is used to re-enable all the features removed in C++20. The effect - is equivalent to manually defining each macro listed below. - This macro is deprecated and will be removed in LLVM-19. Use the - individual macros listed below. 
- **_LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS**: This macro is used to re-enable the `argument_type`, `result_type`, `first_argument_type`, and `second_argument_type` members of class diff --git a/libcxx/include/__config b/libcxx/include/__config index 9b4155af1e3c65..4ccef2ca0d73b4 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -16,17 +16,6 @@ # pragma GCC system_header #endif -#if defined(_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES) && !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS) -# pragma clang deprecated( \ - _LIBCPP_ENABLE_CXX17_REMOVED_FEATURES, \ - "_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES is deprecated in LLVM 18 and will be removed in LLVM 19") -#endif -#if defined(_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES) && !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS) -# pragma clang deprecated( \ - _LIBCPP_ENABLE_CXX20_REMOVED_FEATURES, \ - "_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES is deprecated in LLVM 18 and will be removed in LLVM 19") -#endif - #if defined(__apple_build_version__) // Given AppleClang XX.Y.Z, _LIBCPP_APPLE_CLANG_VER is XXYZ (e.g. AppleClang 14.0.3 => 1403) # define _LIBCPP_COMPILER_CLANG_BASED @@ -1230,21 +1219,6 @@ typedef __char32_t char32_t; # define _LIBCPP_IF_WIDE_CHARACTERS(...) __VA_ARGS__ # endif -# if defined(_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES) -# define _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR -# define _LIBCPP_ENABLE_CXX17_REMOVED_BINDERS -# define _LIBCPP_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE -# define _LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS -# define _LIBCPP_ENABLE_CXX17_REMOVED_UNARY_BINARY_FUNCTION -# endif // _LIBCPP_ENABLE_CXX17_REMOVED_FEATURES - -# if defined(_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES) -# define _LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS -# define _LIBCPP_ENABLE_CXX20_REMOVED_NEGATORS -# define _LIBCPP_ENABLE_CXX20_REMOVED_RAW_STORAGE_ITERATOR -# define _LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS -# endif // _LIBCPP_ENABLE_CXX20_REMOVED_FEATURES - // clang-format off # define _LIBCPP_PUSH_MACROS _Pragma("push_macro(\"min\")") _Pragma("push_macro(\"max\")") _Pragma("push_macro(\"refresh\")") _Pragma("push_macro(\"move\")") _Pragma("push_macro(\"erase\")") # define _LIBCPP_POP_MACROS _Pragma("pop_macro(\"min\")") _Pragma("pop_macro(\"max\")") _Pragma("pop_macro(\"refresh\")") _Pragma("pop_macro(\"move\")") _Pragma("pop_macro(\"erase\")") diff --git a/libcxx/test/libcxx/depr/enable_removed_cpp17_features.compile.pass.cpp b/libcxx/test/libcxx/depr/enable_removed_cpp17_features.compile.pass.cpp deleted file mode 100644 index 1b7acad3cfa464..00000000000000 --- a/libcxx/test/libcxx/depr/enable_removed_cpp17_features.compile.pass.cpp +++ /dev/null @@ -1,36 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// Test that defining _LIBCPP_ENABLE_CXX17_REMOVED_FEATURES correctly defines -// _LIBCPP_ENABLE_CXX17_REMOVED_FOO for each individual component macro. 
- -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES -Wno-deprecated-pragma - -#include <__config> - -#include "test_macros.h" - -#ifndef _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR -# error _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR must be defined -#endif - -#ifndef _LIBCPP_ENABLE_CXX17_REMOVED_BINDERS -# error _LIBCPP_ENABLE_CXX17_REMOVED_BINDERS must be defined -#endif - -#ifndef _LIBCPP_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE -# error _LIBCPP_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE must be defined -#endif - -#ifndef _LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS -#error _LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS must be defined -#endif - -#ifndef _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR -#error _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR must be defined -#endif diff --git a/libcxx/test/libcxx/depr/enable_removed_cpp17_features.deprecated.verify.cpp b/libcxx/test/libcxx/depr/enable_removed_cpp17_features.deprecated.verify.cpp deleted file mode 100644 index 059c1b3ead4f15..00000000000000 --- a/libcxx/test/libcxx/depr/enable_removed_cpp17_features.deprecated.verify.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// <__config> - -// Ensure that defining _LIBCPP_ENABLE_CXX17_REMOVED_FEATURES yields a -// deprecation warning. We intend to issue a deprecation warning in LLVM 18 -// and remove the macro entirely in LLVM 19. As such, this test will be quite -// short lived. - -// UNSUPPORTED: clang-modules-build - -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES - -#include <__config> // expected-warning@* 1+ {{macro '_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES' has been marked as deprecated}} diff --git a/libcxx/test/libcxx/depr/enable_removed_cpp20_features.deprecated.verify.cpp b/libcxx/test/libcxx/depr/enable_removed_cpp20_features.deprecated.verify.cpp deleted file mode 100644 index 163ff7d8fbda03..00000000000000 --- a/libcxx/test/libcxx/depr/enable_removed_cpp20_features.deprecated.verify.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// <__config> - -// Ensure that defining _LIBCPP_ENABLE_CXX20_REMOVED_FEATURES yields a -// deprecation warning. We intend to issue a deprecation warning in LLVM 18 -// and remove the macro entirely in LLVM 19. As such, this test will be quite -// short lived. - -// UNSUPPORTED: clang-modules-build - -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES - -#include // expected-warning@* 1+ {{macro '_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES' has been marked as deprecated}} From 388da6a31b7ba3062f9306b894656e265b9b33eb Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Tue, 16 Apr 2024 20:21:39 +0200 Subject: [PATCH 143/300] [libc++][test] Removes Clang 16 validation. 
(#88558) --- libcxx/test/libcxx/containers/sequences/vector/asan.pass.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libcxx/test/libcxx/containers/sequences/vector/asan.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/asan.pass.cpp index 588ce2a3d17edc..614323b1ffd7be 100644 --- a/libcxx/test/libcxx/containers/sequences/vector/asan.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/vector/asan.pass.cpp @@ -29,8 +29,7 @@ void do_exit() { int main(int, char**) { -#if TEST_STD_VER >= 11 && TEST_CLANG_VER >= 1600 - // TODO(LLVM-18): Remove the special-casing +#if TEST_STD_VER >= 11 { typedef int T; typedef cpp17_input_iterator MyInputIter; @@ -52,7 +51,7 @@ int main(int, char**) assert(v[1] == 'b'); assert(is_contiguous_container_asan_correct(v)); } -#endif +#endif // TEST_STD_VER >= 11 { typedef cpp17_input_iterator MyInputIter; // Sould not trigger ASan. From 8e0a4a89f940d17b520bbca040981f54195d3ea4 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Tue, 16 Apr 2024 20:22:48 +0200 Subject: [PATCH 144/300] [libc++][doc] Documents -DLIBCXX_INSTALL_MODULES=ON. (#88547) Co-authored-by: Louis Dionne --- libcxx/docs/BuildingLibcxx.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst index 28145ed1049e0f..a0a0cdb4339749 100644 --- a/libcxx/docs/BuildingLibcxx.rst +++ b/libcxx/docs/BuildingLibcxx.rst @@ -206,6 +206,12 @@ libc++ specific options Toggle the installation of the libc++ headers. +.. option:: LIBCXX_INSTALL_MODULES:BOOL + + **Default**: ``OFF`` + + Toggle the installation of the experimental libc++ module sources. + .. option:: LIBCXX_ENABLE_SHARED:BOOL **Default**: ``ON`` From 002297bdaa63f3f9f56b0051110ccf48f31c6825 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 16 Apr 2024 14:28:58 -0400 Subject: [PATCH 145/300] [gn] port 22629bb22a1b --- llvm/utils/gn/secondary/libcxx/src/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn index 1f6879358f22bc..955854c7a134bc 100644 --- a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn @@ -125,6 +125,7 @@ cxx_sources = [ "condition_variable_destructor.cpp", "error_category.cpp", "exception.cpp", + "expected.cpp", "fstream.cpp", "functional.cpp", "future.cpp", From 8debcf03c535e14ee47b14fddfcaeae3f32d1317 Mon Sep 17 00:00:00 2001 From: Peiming Liu Date: Tue, 16 Apr 2024 11:31:09 -0700 Subject: [PATCH 146/300] [mlir][sparse] introduce sparse_tensor.iterate operation (#88807) A `sparse_tensor.iterate` iterates over a sparse iteration space extracted from `sparse_tensor.extract_iteration_space` operation introduced in https://github.com/llvm/llvm-project/pull/88554. 
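A rough sketch of how the two operations are meant to compose (syntax as in
the op documentation added below; the tensor type and `#COO` encoding are
placeholders):

```mlir
// Iterate over the first level of a COO tensor %sp, printing every stored
// coordinate.
%space = sparse_tensor.extract_iteration_space %sp lvls = 0
    : tensor<4x8xf32, #COO>
%r = sparse_tensor.iterate %it in %space at (%crd0)
    : !sparse_tensor.iter_space<#COO, lvls = 0 to 1> {
  vector.print %crd0 : index
}
```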
*DO NOT MERGE* before https://github.com/llvm/llvm-project/pull/88554 --- .../Dialect/SparseTensor/IR/SparseTensor.h | 38 ++ .../SparseTensor/IR/SparseTensorAttrDefs.td | 15 + .../SparseTensor/IR/SparseTensorOps.td | 152 +++++++- .../SparseTensor/IR/SparseTensorTypes.td | 95 +++++ .../SparseTensor/IR/SparseTensorDialect.cpp | 365 ++++++++++++++++++ mlir/test/Dialect/SparseTensor/invalid.mlir | 139 +++++++ mlir/test/Dialect/SparseTensor/roundtrip.mlir | 53 +++ .../SparseTensor/sparse_itertion_licm.mlir | 26 ++ 8 files changed, 882 insertions(+), 1 deletion(-) create mode 100644 mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h index 5e523ec428aefb..081a9b8cad8d62 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h @@ -17,9 +17,13 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/IR/OpImplementation.h" #include "mlir/IR/TensorEncoding.h" +#include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include "llvm/ADT/bit.h" + //===----------------------------------------------------------------------===// // // Type aliases to help code be more self-documenting. Unfortunately @@ -41,6 +45,40 @@ using Level = uint64_t; /// including the value `ShapedType::kDynamic` (for shapes). using Size = int64_t; +/// A simple wrapper to encode a bitset of defined (at most 64) levels. +class LevelSet { + uint64_t bits = 0; + +public: + LevelSet() = default; + explicit LevelSet(uint64_t bits) : bits(bits) {} + operator uint64_t() const { return bits; } + + LevelSet &set(unsigned i) { + assert(i < 64); + bits |= 1 << i; + return *this; + } + + LevelSet &operator|=(LevelSet lhs) { + bits |= static_cast(lhs); + return *this; + } + + LevelSet &lshift(unsigned offset) { + bits = bits << offset; + return *this; + } + + bool operator[](unsigned i) const { + assert(i < 64); + return (bits & (1 << i)) != 0; + } + + unsigned count() const { return llvm::popcount(bits); } + bool empty() const { return bits == 0; } +}; + } // namespace sparse_tensor } // namespace mlir diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td index 4a9b9169ae4b86..d5398a98f5b171 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td @@ -19,6 +19,21 @@ class SparseTensor_Attr traits = []> : AttrDef; +//===----------------------------------------------------------------------===// +// A simple bitset attribute wrapped over a single int64_t to encode a set of +// sparse tensor levels. 
+//===----------------------------------------------------------------------===//
+
+def LevelSetAttr :
+    TypedAttrBase<
+      I64, "IntegerAttr",
+      And<[CPred<"::llvm::isa<::mlir::IntegerAttr>($_self)">,
+           CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getType().isInteger(64)">]>,
+      "LevelSet attribute"> {
+  let returnType = [{::mlir::sparse_tensor::LevelSet}];
+  let convertFromStorage = [{::mlir::sparse_tensor::LevelSet($_self.getValue().getZExtValue())}];
+}
+
 //===----------------------------------------------------------------------===//
 // These attributes are just like `IndexAttr` except that they clarify whether
 // the index refers to a dimension (an axis of the semantic tensor) or a level
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index 0cfc64f9988a0a..b43d716d5e8642 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -15,6 +15,8 @@ include "mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td"
 include "mlir/Dialect/SparseTensor/IR/SparseTensorInterfaces.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Interfaces/LoopLikeInterface.td"
 
 //===----------------------------------------------------------------------===//
 // Base class.
@@ -1277,7 +1279,7 @@ def SparseTensor_SelectOp : SparseTensor_Op<"select", [Pure, SameOperandsAndResu
 
 def SparseTensor_YieldOp : SparseTensor_Op<"yield", [Pure, Terminator,
     ParentOneOf<["BinaryOp", "UnaryOp", "ReduceOp", "SelectOp",
-                 "ForeachOp"]>]>,
+                 "ForeachOp", "IterateOp"]>]>,
     Arguments<(ins Variadic<AnyType>:$results)> {
  let summary = "Yield from sparse_tensor set-like operations";
  let description = [{
@@ -1430,6 +1432,154 @@ def SparseTensor_ForeachOp : SparseTensor_Op<"foreach",
   let hasVerifier = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// Sparse Tensor Iteration Operations.
+//===----------------------------------------------------------------------===//
+
+def ExtractIterSpaceOp : SparseTensor_Op<"extract_iteration_space",
+    [Pure, DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+
+  let arguments = (ins AnySparseTensor:$tensor,
+                       Optional<AnySparseIterator>:$parentIter,
+                       LevelAttr:$loLvl, LevelAttr:$hiLvl);
+
+  let results = (outs AnySparseIterSpace:$resultSpace);
+
+  let summary = "Extract an iteration space from a sparse tensor between certain levels";
+  let description = [{
+    Extracts a `!sparse_tensor.iter_space` from a sparse tensor between
+    certain (consecutive) levels.
+
+    `tensor`: the input sparse tensor that defines the iteration space.
+    `parentIter`: the iterator for the previous level, at which the iteration space
+    for the current levels will be extracted.
+    `loLvl`, `hiLvl`: the level range between [loLvl, hiLvl) in the input tensor that
+    the returned iteration space covers. `hiLvl - loLvl` defines the dimension of the
+    iteration space.
+
+    Example:
+    ```mlir
+    // Extracts a 1-D iteration space from a COO tensor at level 1.
+    %space = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1
+        : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0>
+    ```
+  }];
+
+
+  let extraClassDeclaration = [{
+    std::pair<Level, Level> getLvlRange() {
+      return std::make_pair(getLoLvl(), getHiLvl());
+    }
+    unsigned getSpaceDim() {
+      return getHiLvl() - getLoLvl();
+    }
+    ArrayRef<::mlir::sparse_tensor::LevelType> getSpaceLvlTypes() {
+      return getResultSpace().getType().getLvlTypes();
+    }
+  }];
+
+  let hasVerifier = 1;
+  let assemblyFormat = "$tensor (`at` $parentIter^)? `lvls` `=` custom<LevelRange>($loLvl, $hiLvl) "
+                       " attr-dict `:` type($tensor) (`,` type($parentIter)^)?";
+}
+
+def IterateOp : SparseTensor_Op<"iterate",
+    [RecursiveMemoryEffects, RecursivelySpeculatable,
+     DeclareOpInterfaceMethods<LoopLikeOpInterface,
+         ["getInitsMutable", "getLoopResults", "getRegionIterArgs",
+          "getYieldedValuesMutable"]>,
+     DeclareOpInterfaceMethods<RegionBranchOpInterface,
+         ["getEntrySuccessorOperands"]>,
+     SingleBlockImplicitTerminator<"sparse_tensor::YieldOp">]> {
+
+  let arguments = (ins AnySparseIterSpace:$iterSpace,
+                       Variadic<AnyType>:$initArgs,
+                       LevelSetAttr:$crdUsedLvls);
+  let results = (outs Variadic<AnyType>:$results);
+  let regions = (region SizedRegion<1>:$region);
+
+  let summary = "Iterate over a sparse iteration space";
+  let description = [{
+    The `sparse_tensor.iterate` operation represents a loop over the
+    provided iteration space extracted from a specific sparse tensor.
+    The operation defines an SSA value for a sparse iterator that points
+    to the current stored element in the sparse tensor and SSA values
+    for coordinates of the stored element. The coordinates are always
+    converted to `index` type regardless of the underlying sparse tensor
+    storage. When coordinates are not used, the SSA values can be skipped
+    by `_` symbols, which usually leads to simpler generated code after
+    sparsification. For example:
+
+    ```mlir
+    // The coordinate for level 0 is not used when iterating over a 2-D
+    // iteration space.
+    sparse_tensor.iterate %iterator in %space at(_, %crd_1)
+       : !sparse_tensor.iter_space<#CSR, lvls = 0 to 2>
+    ```
+
+    `sparse_tensor.iterate` can also operate on loop-carried variables
+    and returns the final values after loop termination.
+    The initial values of the variables are passed as additional SSA operands
+    to the `sparse_tensor.iterate`, following the iterator SSA value and the
+    used coordinate SSA values mentioned above. The operation region has an
+    argument for the iterator, variadic arguments for the specified (used)
+    coordinates, followed by one argument for each loop-carried variable,
+    representing the value of the variable at the current iteration.
+    The body region must contain exactly one block that terminates with
+    `sparse_tensor.yield`.
+
+    `sparse_tensor.iterate` results hold the final values after the last
+    iteration. If the `sparse_tensor.iterate` defines any values, a yield
+    must be explicitly present.
+    The number and types of the `sparse_tensor.iterate` results must match
+    the initial values in the iter_args binding and the yield operands.
+
+
+    A nested `sparse_tensor.iterate` example that prints all the coordinates
+    stored in the sparse input:
+
+    ```mlir
+    func.func @nested_iterate(%sp : tensor<4x8xf32, #COO>) {
+      // Iterates over the first level of %sp
+      %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO>
+      sparse_tensor.iterate %it1 in %l1 at (%crd0)
+          : !sparse_tensor.iter_space<#COO, lvls = 0 to 1> {
+        // Iterates over the second level of %sp
+        %l2 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1
+            : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0 to 1>
+        sparse_tensor.iterate %it2 in %l2 at (%crd1)
+            : !sparse_tensor.iter_space<#COO, lvls = 1 to 2> {
+          vector.print %crd0 : index
+          vector.print %crd1 : index
+        }
+      }
+      return
+    }
+
+    ```
+  }];
+
+  let extraClassDeclaration = [{
+    unsigned getSpaceDim() {
+      return getIterSpace().getType().getSpaceDim();
+    }
+    BlockArgument getIterator() {
+      return getRegion().getArguments().front();
+    }
+    Block::BlockArgListType getCrds() {
+      // The first block argument is the iterator, the remaining arguments are
+      // the referenced coordinates.
+      return getRegion().getArguments().slice(1, getCrdUsedLvls().count());
+    }
+    unsigned getNumRegionIterArgs() {
+      return getRegion().getArguments().size() - 1 - getCrdUsedLvls().count();
+    }
+  }];
+
+  let hasVerifier = 1;
+  let hasRegionVerifier = 1;
+  let hasCustomAssemblyFormat = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // Sparse Tensor Debugging and Test-Only Operations.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td
index 185cff46ae25d5..264a0a5b3bee6c 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td
@@ -72,4 +72,99 @@ def SparseTensorStorageSpecifier
     : Type<CPred<"::llvm::isa<::mlir::sparse_tensor::StorageSpecifierType>($_self)">, "metadata",
           "::mlir::sparse_tensor::StorageSpecifierType">;
 
+//===----------------------------------------------------------------------===//
+// Sparse Tensor Iteration Types.
+//===----------------------------------------------------------------------===//
+
+def SparseTensor_IterSpace : SparseTensor_Type<"IterSpace"> {
+  let mnemonic = "iter_space";
+
+  let description = [{
+    A sparse iteration space that represents an abstract N-D (sparse) iteration space
+    extracted from a sparse tensor.
+
+    Examples:
+
+    ```mlir
+    // An iteration space extracted from a CSR tensor between levels [0, 2).
+    !iter_space<#CSR, lvls = 0 to 2>
+    ```
+  }];
+
+  let parameters = (ins
+      SparseTensorEncodingAttr : $encoding,
+      "Level" : $loLvl,
+      "Level" : $hiLvl
+  );
+
+  let extraClassDeclaration = [{
+    /// The dimension of the iteration space.
+    unsigned getSpaceDim() const {
+      return getHiLvl() - getLoLvl();
+    }
+
+    /// Get the level types for the iteration space.
+    ArrayRef<LevelType> getLvlTypes() const {
+      return getEncoding().getLvlTypes().slice(getLoLvl(), getSpaceDim());
+    }
+
+    /// Whether the iteration space is unique (i.e., no duplicated coordinate).
+    bool isUnique() {
+      return !getLvlTypes().back().isa<LevelPropNonDefault::Nonunique>();
+    }
+
+    /// Get the corresponding iterator type.
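+    /// (The iterator type shares this space's encoding and level range.)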
+    ::mlir::sparse_tensor::IteratorType getIteratorType() const;
+  }];
+
+  let assemblyFormat="`<` $encoding `,` `lvls` `=` custom<LevelRange>($loLvl, $hiLvl) `>`";
+}
+
+def SparseTensor_Iterator : SparseTensor_Type<"Iterator"> {
+  let mnemonic = "iterator";
+
+  let description = [{
+    An iterator that points to the current element in the corresponding iteration space.
+
+    Examples:
+
+    ```mlir
+    // An iterator that iterates over an iteration space of type `!iter_space<#CSR, lvls = 0 to 2>`
+    !iterator<#CSR, lvls = 0 to 2>
+    ```
+  }];
+
+  let parameters = (ins
+      SparseTensorEncodingAttr : $encoding,
+      "Level" : $loLvl,
+      "Level" : $hiLvl
+  );
+
+  let extraClassDeclaration = [{
+    /// Get the corresponding iteration space type.
+    ::mlir::sparse_tensor::IterSpaceType getIterSpaceType() const;
+
+    unsigned getSpaceDim() const { return getIterSpaceType().getSpaceDim(); }
+    ArrayRef<LevelType> getLvlTypes() const { return getIterSpaceType().getLvlTypes(); }
+    bool isUnique() { return getIterSpaceType().isUnique(); }
+  }];
+
+  let assemblyFormat="`<` $encoding `,` `lvls` `=` custom<LevelRange>($loLvl, $hiLvl) `>`";
+}
+
+def IsSparseSparseIterSpaceTypePred
+    : CPred<"::llvm::isa<::mlir::sparse_tensor::IterSpaceType>($_self)">;
+
+def IsSparseSparseIteratorTypePred
+    : CPred<"::llvm::isa<::mlir::sparse_tensor::IteratorType>($_self)">;
+
+def AnySparseIterSpace
+    : Type<IsSparseSparseIterSpaceTypePred, "sparse iteration space",
+           "::mlir::sparse_tensor::IterSpaceType">;
+
+def AnySparseIterator
+    : Type<IsSparseSparseIteratorTypePred, "sparse iterator",
+           "::mlir::sparse_tensor::IteratorType">;
+
+
 #endif // SPARSETENSOR_TYPES
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index e9058394d33da5..36908def09f403 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -30,6 +30,14 @@
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.cpp.inc"
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorAttrEnums.cpp.inc"
 
+// Forward declarations: the following custom print/parse methods are
+// referenced by the code generated from SparseTensorTypes.td.
+static mlir::ParseResult parseLevelRange(mlir::AsmParser &,
+                                         mlir::sparse_tensor::Level &,
+                                         mlir::sparse_tensor::Level &);
+static void printLevelRange(mlir::AsmPrinter &, mlir::sparse_tensor::Level,
+                            mlir::sparse_tensor::Level);
+
 #define GET_TYPEDEF_CLASSES
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorTypes.cpp.inc"
 
@@ -1953,6 +1961,363 @@ LogicalResult SortOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// Sparse Tensor Iteration Operations.
+//===----------------------------------------------------------------------===// + +IterSpaceType IteratorType::getIterSpaceType() const { + return IterSpaceType::get(getContext(), getEncoding(), getLoLvl(), + getHiLvl()); +} + +IteratorType IterSpaceType::getIteratorType() const { + return IteratorType::get(getContext(), getEncoding(), getLoLvl(), getHiLvl()); +} + +/// Parses a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static ParseResult parseLevelRange(AsmParser &parser, Level &lvlLo, + Level &lvlHi) { + if (parser.parseInteger(lvlLo)) + return failure(); + + if (succeeded(parser.parseOptionalKeyword("to"))) { + if (parser.parseInteger(lvlHi)) + return failure(); + } else { + lvlHi = lvlLo + 1; + } + + if (lvlHi <= lvlLo) + parser.emitError(parser.getNameLoc(), + "expect larger level upper bound than lower bound"); + + return success(); +} + +/// Parses a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static ParseResult parseLevelRange(OpAsmParser &parser, IntegerAttr &lvlLoAttr, + IntegerAttr &lvlHiAttr) { + Level lvlLo, lvlHi; + if (parseLevelRange(parser, lvlLo, lvlHi)) + return failure(); + + lvlLoAttr = IntegerAttr::get(parser.getBuilder().getIndexType(), lvlLo); + lvlHiAttr = IntegerAttr::get(parser.getBuilder().getIndexType(), lvlHi); + return success(); +} + +/// Prints a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static void printLevelRange(AsmPrinter &p, Level lo, Level hi) { + + if (lo + 1 == hi) + p << lo; + else + p << lo << " to " << hi; +} + +/// Prints a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static void printLevelRange(OpAsmPrinter &p, Operation *, IntegerAttr lvlLo, + IntegerAttr lvlHi) { + unsigned lo = lvlLo.getValue().getZExtValue(); + unsigned hi = lvlHi.getValue().getZExtValue(); + printLevelRange(p, lo, hi); +} + +static ParseResult +parseSparseSpaceLoop(OpAsmParser &parser, OperationState &state, + SmallVectorImpl &iterators, + SmallVectorImpl &iterArgs) { + SmallVector spaces; + SmallVector initArgs; + + // Parses "%iters, ... in %spaces, ..." + if (parser.parseArgumentList(iterators) || parser.parseKeyword("in") || + parser.parseOperandList(spaces)) + return failure(); + + if (iterators.size() != spaces.size()) + return parser.emitError( + parser.getNameLoc(), + "mismatch in number of sparse iterators and sparse spaces"); + + // Parse "at(%crd0, _, ...)" + LevelSet crdUsedLvlSet; + bool hasUsedCrds = succeeded(parser.parseOptionalKeyword("at")); + unsigned lvlCrdCnt = 0; + if (hasUsedCrds) { + ParseResult crdList = parser.parseCommaSeparatedList( + OpAsmParser::Delimiter::Paren, [&]() -> ParseResult { + if (parser.parseOptionalKeyword("_")) { + if (parser.parseArgument(iterArgs.emplace_back())) + return failure(); + // Always use IndexType for the coordinate. + crdUsedLvlSet.set(lvlCrdCnt); + iterArgs.back().type = parser.getBuilder().getIndexType(); + } + lvlCrdCnt += 1; + return success(); + }); + if (failed(crdList)) { + return parser.emitError( + parser.getNameLoc(), + "expecting SSA value or \"_\" for level coordinates"); + } + } + // Set the CrdUsedLvl bitset. 
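+  // Bit i of the set is 1 iff the coordinate of level i was bound to an SSA
+  // value above; levels written as `_` leave their bit cleared.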
+ state.addAttribute("crdUsedLvls", + parser.getBuilder().getI64IntegerAttr(crdUsedLvlSet)); + + // Parse "iter_args(%arg = %init, ...)" + bool hasIterArgs = succeeded(parser.parseOptionalKeyword("iter_args")); + if (hasIterArgs) + if (parser.parseAssignmentList(iterArgs, initArgs)) + return failure(); + + SmallVector iterSpaceTps; + // parse ": sparse_tensor.iter_space -> ret" + if (parser.parseColon() || parser.parseTypeList(iterSpaceTps)) + return failure(); + if (iterSpaceTps.size() != spaces.size()) + return parser.emitError(parser.getNameLoc(), + "mismatch in number of iteration space operands " + "and iteration space types"); + + for (auto [it, tp] : llvm::zip_equal(iterators, iterSpaceTps)) { + IterSpaceType spaceTp = llvm::dyn_cast(tp); + if (!spaceTp) + return parser.emitError(parser.getNameLoc(), + "expected sparse_tensor.iter_space type for " + "iteration space operands"); + if (hasUsedCrds && spaceTp.getSpaceDim() != lvlCrdCnt) + return parser.emitError(parser.getNameLoc(), + "mismatch in number of iteration space dimension " + "and specified coordinates"); + it.type = spaceTp.getIteratorType(); + } + + if (hasIterArgs) + if (parser.parseArrowTypeList(state.types)) + return failure(); + + // Resolves input operands. + if (parser.resolveOperands(spaces, iterSpaceTps, parser.getNameLoc(), + state.operands)) + return failure(); + + if (hasIterArgs) { + unsigned numCrds = crdUsedLvlSet.count(); + // Strip off leading args that used for coordinates. + MutableArrayRef args = MutableArrayRef(iterArgs).drop_front(numCrds); + if (args.size() != initArgs.size() || args.size() != state.types.size()) { + return parser.emitError( + parser.getNameLoc(), + "mismatch in number of iteration arguments and return values"); + } + + for (auto [it, init, tp] : llvm::zip_equal(args, initArgs, state.types)) { + it.type = tp; + if (parser.resolveOperand(init, tp, state.operands)) + return failure(); + } + } + return success(); +} + +LogicalResult ExtractIterSpaceOp::inferReturnTypes( + MLIRContext *ctx, std::optional loc, ValueRange ops, + DictionaryAttr attr, OpaqueProperties prop, RegionRange region, + SmallVectorImpl &ret) { + + ExtractIterSpaceOp::Adaptor adaptor(ops, attr, prop, region); + SparseTensorType stt = getSparseTensorType(adaptor.getTensor()); + ret.push_back(IterSpaceType::get(ctx, stt.getEncoding(), adaptor.getLoLvl(), + adaptor.getHiLvl())); + return success(); +} + +LogicalResult ExtractIterSpaceOp::verify() { + if (getLoLvl() >= getHiLvl()) + return emitOpError("expected smaller level low than level high"); + + TypedValue pIter = getParentIter(); + if ((pIter && getLoLvl() == 0) || (!pIter && getLoLvl() != 0)) { + return emitOpError( + "parent iterator should be specified iff level lower bound equals 0"); + } + + if (pIter) { + IterSpaceType spaceTp = getResultSpace().getType(); + if (pIter.getType().getEncoding() != spaceTp.getEncoding()) + return emitOpError( + "mismatch in parent iterator encoding and iteration space encoding."); + + if (spaceTp.getLoLvl() != pIter.getType().getHiLvl()) + return emitOpError("parent iterator should be used to extract an " + "iteration space from a consecutive level."); + } + + return success(); +} + +ParseResult IterateOp::parse(OpAsmParser &parser, OperationState &result) { + OpAsmParser::Argument iterator; + OpAsmParser::UnresolvedOperand iterSpace; + + SmallVector iters, iterArgs; + if (parseSparseSpaceLoop(parser, result, iters, iterArgs)) + return failure(); + if (iters.size() != 1) + return parser.emitError(parser.getNameLoc(), + "expected 
only one iterator/iteration space"); + + iters.append(iterArgs); + Region *body = result.addRegion(); + if (parser.parseRegion(*body, iters)) + return failure(); + + IterateOp::ensureTerminator(*body, parser.getBuilder(), result.location); + + // Parse the optional attribute list. + if (parser.parseOptionalAttrDict(result.attributes)) + return failure(); + + return success(); +} + +/// Prints the initialization list in the form of +/// (%inner = %outer, %inner2 = %outer2, <...>) +/// where 'inner' values are assumed to be region arguments and 'outer' values +/// are regular SSA values. +static void printInitializationList(OpAsmPrinter &p, + Block::BlockArgListType blocksArgs, + ValueRange initializers, + StringRef prefix = "") { + assert(blocksArgs.size() == initializers.size() && + "expected same length of arguments and initializers"); + if (initializers.empty()) + return; + + p << prefix << '('; + llvm::interleaveComma(llvm::zip(blocksArgs, initializers), p, [&](auto it) { + p << std::get<0>(it) << " = " << std::get<1>(it); + }); + p << ")"; +} + +static void printUsedCrdsList(OpAsmPrinter &p, unsigned spaceDim, + Block::BlockArgListType blocksArgs, + LevelSet crdUsedLvls) { + if (crdUsedLvls.empty()) + return; + + p << " at("; + for (unsigned i = 0; i < spaceDim; i++) { + if (crdUsedLvls[i]) { + p << blocksArgs.front(); + blocksArgs = blocksArgs.drop_front(); + } else { + p << "_"; + } + if (i != spaceDim - 1) + p << ", "; + } + assert(blocksArgs.empty()); + p << ")"; +} + +void IterateOp::print(OpAsmPrinter &p) { + p << " " << getIterator() << " in " << getIterSpace(); + printUsedCrdsList(p, getSpaceDim(), getCrds(), getCrdUsedLvls()); + printInitializationList(p, getRegionIterArgs(), getInitArgs(), " iter_args"); + + p << " : " << getIterSpace().getType() << " "; + if (!getInitArgs().empty()) + p << "-> (" << getInitArgs().getTypes() << ") "; + + p.printRegion(getRegion(), /*printEntryBlockArgs=*/false, + /*printBlockTerminators=*/!getInitArgs().empty()); +} + +LogicalResult IterateOp::verify() { + if (getInitArgs().size() != getNumResults()) { + return emitOpError( + "mismatch in number of loop-carried values and defined values"); + } + return success(); +} + +LogicalResult IterateOp::verifyRegions() { + if (getIterator().getType() != getIterSpace().getType().getIteratorType()) + return emitOpError("mismatch in iterator and iteration space type"); + if (getNumRegionIterArgs() != getNumResults()) + return emitOpError( + "mismatch in number of basic block args and defined values"); + + auto initArgs = getInitArgs(); + auto iterArgs = getRegionIterArgs(); + auto yieldVals = getYieldedValues(); + auto opResults = getResults(); + if (!llvm::all_equal({initArgs.size(), iterArgs.size(), yieldVals.size(), + opResults.size()})) { + return emitOpError() << "number mismatch between iter args and results."; + } + + for (auto [i, init, iter, yield, ret] : + llvm::enumerate(initArgs, iterArgs, yieldVals, opResults)) { + if (init.getType() != ret.getType()) + return emitOpError() << "types mismatch between " << i + << "th iter operand and defined value"; + if (iter.getType() != ret.getType()) + return emitOpError() << "types mismatch between " << i + << "th iter region arg and defined value"; + if (yield.getType() != ret.getType()) + return emitOpError() << "types mismatch between " << i + << "th yield value and defined value"; + } + + return success(); +} + +/// IterateOp implemented OpInterfaces' methods. 
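+/// These hooks let generic loop passes (e.g. LICM via LoopLikeOpInterface)
+/// treat sparse_tensor.iterate like any other structured loop.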
+SmallVector IterateOp::getLoopRegions() { return {&getRegion()}; } + +MutableArrayRef IterateOp::getInitsMutable() { + return getInitArgsMutable(); +} + +Block::BlockArgListType IterateOp::getRegionIterArgs() { + return getRegion().getArguments().take_back(getNumRegionIterArgs()); +} + +std::optional> IterateOp::getYieldedValuesMutable() { + return cast( + getRegion().getBlocks().front().getTerminator()) + .getResultsMutable(); +} + +std::optional IterateOp::getLoopResults() { return getResults(); } + +OperandRange IterateOp::getEntrySuccessorOperands(RegionBranchPoint point) { + return getInitArgs(); +} + +void IterateOp::getSuccessorRegions(RegionBranchPoint point, + SmallVectorImpl ®ions) { + // Both the operation itself and the region may be branching into the body or + // back into the operation itself. + regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); + // It is possible for loop not to enter the body. + regions.push_back(RegionSuccessor(getResults())); +} + +//===----------------------------------------------------------------------===// +// Sparse Tensor Dialect Setups. +//===----------------------------------------------------------------------===// + /// Materialize a single constant operation from a given attribute value with /// the desired resultant type. Operation *SparseTensorDialect::materializeConstant(OpBuilder &builder, diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir index 7f5c05190fc9a2..b13024cd4ed99d 100644 --- a/mlir/test/Dialect/SparseTensor/invalid.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid.mlir @@ -1012,3 +1012,142 @@ func.func @sparse_print(%arg0: tensor<10x10xf64>) { sparse_tensor.print %arg0 : tensor<10x10xf64> return } + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 2>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' expect larger level upper bound than lower bound}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 2 to 0 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 2> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be specified iff level lower bound equals 0}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 0 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be specified iff level lower bound equals 0}} + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 1 : tensor<4x8xf32, #COO> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +#CSR = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : dense, + j : compressed + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#CSR, 
lvls = 0>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op mismatch in parent iterator encoding and iteration space encoding.}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#CSR, lvls = 0> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be used to extract an iteration space from a consecutive level.}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 2 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> + return +} + + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -> index { + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> + // expected-error @+1 {{'sparse_tensor.iterate' op different number of region iter_args and yielded values: 2 != 1}} + %r1, %r2 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%si = %i, %sj = %j): !sparse_tensor.iter_space<#COO, lvls = 0> -> (index, index) { + sparse_tensor.yield %si : index + } + return %r1 : index +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +// expected-note@+1 {{prior use here}} +func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index) -> f32 { + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> + // expected-error @+1 {{use of value '%i' expects different type than prior uses: 'f32' vs 'index'}} + %r1 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%outer = %i): !sparse_tensor.iter_space<#COO, lvls = 0> -> f32 { + sparse_tensor.yield %outer : f32 + } + return %r1 : f32 +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -> index { + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> + // expected-error @+1 {{'sparse_tensor.iterate' op 0-th region iter_arg and 0-th yielded value have different type: 'index' != 'f32'}} + %r1 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%si = %i): !sparse_tensor.iter_space<#COO, lvls = 0> -> index { + %y = arith.constant 1.0 : f32 + sparse_tensor.yield %y : f32 + } + return %r1 : index +} diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir index 12f69c1d37b9cd..e9a898f16b41d2 100644 --- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir @@ -738,3 +738,56 @@ func.func @sparse_has_runtime() -> i1 { %has_runtime = sparse_tensor.has_runtime_library return %has_runtime : i1 } + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +// CHECK-LABEL: func.func @sparse_extract_iter_space( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32, #sparse{{[0-9]*}}>, +// CHECK-SAME: %[[VAL_1:.*]]: !sparse_tensor.iterator<#sparse{{[0-9]*}}, lvls = 0>) +// CHECK: %[[VAL_2:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] 
lvls = 0 +// CHECK: %[[VAL_3:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] at %[[VAL_1]] lvls = 1 +// CHECK: return %[[VAL_2]], %[[VAL_3]] : !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0>, !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 1> +// CHECK: } +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) + -> (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>) { + // Extracting the iteration space for the first level needs no parent iterator. + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> + // Extracting the iteration space for the second level needs a parent iterator. + %l2 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> + return %l1, %l2 : !sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1> +} + + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +// CHECK-LABEL: func.func @sparse_iterate( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32, #sparse{{[0-9]*}}>, +// CHECK-SAME: %[[VAL_1:.*]]: index, +// CHECK-SAME: %[[VAL_2:.*]]: index) -> index { +// CHECK: %[[VAL_3:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] lvls = 0 : tensor<4x8xf32, #sparse{{[0-9]*}}> +// CHECK: %[[VAL_4:.*]] = sparse_tensor.iterate %[[VAL_5:.*]] in %[[VAL_3]] at(%[[VAL_6:.*]]) iter_args(%[[VAL_7:.*]] = %[[VAL_1]]) : !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0> -> (index) { +// CHECK: sparse_tensor.yield %[[VAL_7]] : index +// CHECK: } +// CHECK: return %[[VAL_4]] : index +// CHECK: } +func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -> index { + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> + %r1 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%outer = %i): !sparse_tensor.iter_space<#COO, lvls = 0 to 1> -> index { + sparse_tensor.yield %outer : index + } + return %r1 : index +} diff --git a/mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir b/mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir new file mode 100644 index 00000000000000..e7158d04b37feb --- /dev/null +++ b/mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir @@ -0,0 +1,26 @@ +// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s + +#CSR = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : dense, + j : compressed + ) +}> + +// Make sure that pure instructions are hoisted outside the loop. 
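+// (The sparse_tensor.values/positions/coordinates reads below depend only on
+// %sp, which is defined outside the loop, so LICM can hoist them.)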
+// +// CHECK: sparse_tensor.values +// CHECK: sparse_tensor.positions +// CHECK: sparse_tensor.coordinate +// CHECK: sparse_tensor.iterate +func.func @sparse_iterate(%sp : tensor) { + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor + sparse_tensor.iterate %it1 in %l1 at (%crd) : !sparse_tensor.iter_space<#CSR, lvls = 0> { + %0 = sparse_tensor.values %sp : tensor to memref + %1 = sparse_tensor.positions %sp { level = 1 : index } : tensor to memref + %2 = sparse_tensor.coordinates %sp { level = 1 : index } : tensor to memref + "test.op"(%0, %1, %2) : (memref, memref, memref) -> () + } + + return +} From b9556532c7391a2acb77ab0f7d7b36e1dc382b24 Mon Sep 17 00:00:00 2001 From: Peiming Liu Date: Tue, 16 Apr 2024 11:31:33 -0700 Subject: [PATCH 147/300] Revert "[mlir][sparse] introduce sparse_tensor.iterate operation" (#88953) Reverts llvm/llvm-project#88807 (merged by mistake) --- .../Dialect/SparseTensor/IR/SparseTensor.h | 38 -- .../SparseTensor/IR/SparseTensorAttrDefs.td | 15 - .../SparseTensor/IR/SparseTensorOps.td | 152 +------- .../SparseTensor/IR/SparseTensorTypes.td | 95 ----- .../SparseTensor/IR/SparseTensorDialect.cpp | 365 ------------------ mlir/test/Dialect/SparseTensor/invalid.mlir | 139 ------- mlir/test/Dialect/SparseTensor/roundtrip.mlir | 53 --- .../SparseTensor/sparse_itertion_licm.mlir | 26 -- 8 files changed, 1 insertion(+), 882 deletions(-) delete mode 100644 mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h index 081a9b8cad8d62..5e523ec428aefb 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensor.h @@ -17,13 +17,9 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/IR/OpImplementation.h" #include "mlir/IR/TensorEncoding.h" -#include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/InferTypeOpInterface.h" -#include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -#include "llvm/ADT/bit.h" - //===----------------------------------------------------------------------===// // // Type aliases to help code be more self-documenting. Unfortunately @@ -45,40 +41,6 @@ using Level = uint64_t; /// including the value `ShapedType::kDynamic` (for shapes). using Size = int64_t; -/// A simple wrapper to encode a bitset of defined (at most 64) levels. 
-class LevelSet { - uint64_t bits = 0; - -public: - LevelSet() = default; - explicit LevelSet(uint64_t bits) : bits(bits) {} - operator uint64_t() const { return bits; } - - LevelSet &set(unsigned i) { - assert(i < 64); - bits |= 1 << i; - return *this; - } - - LevelSet &operator|=(LevelSet lhs) { - bits |= static_cast(lhs); - return *this; - } - - LevelSet &lshift(unsigned offset) { - bits = bits << offset; - return *this; - } - - bool operator[](unsigned i) const { - assert(i < 64); - return (bits & (1 << i)) != 0; - } - - unsigned count() const { return llvm::popcount(bits); } - bool empty() const { return bits == 0; } -}; - } // namespace sparse_tensor } // namespace mlir diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td index d5398a98f5b171..4a9b9169ae4b86 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td @@ -19,21 +19,6 @@ class SparseTensor_Attr traits = []> : AttrDef; -//===----------------------------------------------------------------------===// -// A simple bitset attribute wrapped over a single int64_t to encode a set of -// sparse tensor levels. -//===----------------------------------------------------------------------===// - -def LevelSetAttr : - TypedAttrBase< - I64, "IntegerAttr", - And<[CPred<"::llvm::isa<::mlir::IntegerAttr>($_self)">, - CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getType().isInteger(64)">]>, - "LevelSet attribute"> { - let returnType = [{::mlir::sparse_tensor::LevelSet}]; - let convertFromStorage = [{::mlir::sparse_tensor::LevelSet($_self.getValue().getZExtValue())}]; -} - //===----------------------------------------------------------------------===// // These attributes are just like `IndexAttr` except that they clarify whether // the index refers to a dimension (an axis of the semantic tensor) or a level diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td index b43d716d5e8642..0cfc64f9988a0a 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td @@ -15,8 +15,6 @@ include "mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td" include "mlir/Dialect/SparseTensor/IR/SparseTensorInterfaces.td" include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "mlir/Interfaces/ControlFlowInterfaces.td" -include "mlir/Interfaces/LoopLikeInterface.td" //===----------------------------------------------------------------------===// // Base class. @@ -1279,7 +1277,7 @@ def SparseTensor_SelectOp : SparseTensor_Op<"select", [Pure, SameOperandsAndResu def SparseTensor_YieldOp : SparseTensor_Op<"yield", [Pure, Terminator, ParentOneOf<["BinaryOp", "UnaryOp", "ReduceOp", "SelectOp", - "ForeachOp", "IterateOp"]>]>, + "ForeachOp"]>]>, Arguments<(ins Variadic:$results)> { let summary = "Yield from sparse_tensor set-like operations"; let description = [{ @@ -1432,154 +1430,6 @@ def SparseTensor_ForeachOp : SparseTensor_Op<"foreach", let hasVerifier = 1; } -//===----------------------------------------------------------------------===// -// Sparse Tensor Iteration Operations. 
-//===----------------------------------------------------------------------===// - -def ExtractIterSpaceOp : SparseTensor_Op<"extract_iteration_space", - [Pure, DeclareOpInterfaceMethods]> { - - let arguments = (ins AnySparseTensor:$tensor, - Optional:$parentIter, - LevelAttr:$loLvl, LevelAttr:$hiLvl); - - let results = (outs AnySparseIterSpace:$resultSpace); - - let summary = "Extract an iteration space from a sparse tensor between certain levels"; - let description = [{ - Extracts a `!sparse_tensor.iter_space` from a sparse tensor between - certian (consecutive) levels. - - `tensor`: the input sparse tensor that defines the iteration space. - `parentIter`: the iterator for the previous level, at which the iteration space - at the current levels will be extracted. - `loLvl`, `hiLvl`: the level range between [loLvl, hiLvl) in the input tensor that - the returned iteration space covers. `hiLvl - loLvl` defines the dimension of the - iteration space. - - Example: - ```mlir - // Extracts a 1-D iteration space from a COO tensor at level 1. - %space = sparse_tensor.iteration.extract_space %sp at %it1 lvls = 1 - : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> - ``` - }]; - - - let extraClassDeclaration = [{ - std::pair getLvlRange() { - return std::make_pair(getLoLvl(), getHiLvl()); - } - unsigned getSpaceDim() { - return getHiLvl() - getLoLvl(); - } - ArrayRef<::mlir::sparse_tensor::LevelType> getSpaceLvlTypes() { - return getResultSpace().getType().getLvlTypes(); - } - }]; - - let hasVerifier = 1; - let assemblyFormat = "$tensor (`at` $parentIter^)? `lvls` `=` custom($loLvl, $hiLvl) " - " attr-dict `:` type($tensor) (`,` type($parentIter)^)?"; -} - -def IterateOp : SparseTensor_Op<"iterate", - [RecursiveMemoryEffects, RecursivelySpeculatable, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods, - SingleBlockImplicitTerminator<"sparse_tensor::YieldOp">]> { - - let arguments = (ins AnySparseIterSpace:$iterSpace, - Variadic:$initArgs, - LevelSetAttr:$crdUsedLvls); - let results = (outs Variadic:$results); - let regions = (region SizedRegion<1>:$region); - - let summary = "Iterate over a sparse iteration space"; - let description = [{ - The `sparse_tensor.iterate` operations represents a loop over the - provided iteration space extracted from a specific sparse tensor. - The operation defines an SSA value for a sparse iterator that points - to the current stored element in the sparse tensor and SSA values - for coordinates of the stored element. The coordinates are always - converted to `index` type despite of the underlying sparse tensor - storage. When coordinates are not used, the SSA values can be skipped - by `_` symbols, which usually leads to simpler generated code after - sparsification. For example: - - ```mlir - // The coordinate for level 0 is not used when iterating over a 2-D - // iteration space. - %sparse_tensor.iterate %iterator in %space at(_, %crd_1) - : !sparse_tensor.iter_space<#CSR, lvls = 0 to 2> - ``` - - `sparse_tensor.iterate` can also operate on loop-carried variables - and returns the final values after loop termination. - The initial values of the variables are passed as additional SSA operands - to the iterator SSA value and used coordinate SSA values mentioned - above. The operation region has an argument for the iterator, variadic - arguments for specified (used) coordiates and followed by one argument - for each loop-carried variable, representing the value of the variable - at the current iteration. 
- The body region must contain exactly one block that terminates with - `sparse_tensor.yield`. - - `sparse_tensor.iterate` results hold the final values after the last - iteration. If the `sparse_tensor.iterate` defines any values, a yield - must be explicitly present. - The number and types of the `sparse_tensor.iterate` results must match - the initial values in the iter_args binding and the yield operands. - - - A nested `sparse_tensor.iterate` example that prints all the coordinates - stored in the sparse input: - - ```mlir - func.func @nested_iterate(%sp : tensor<4x8xf32, #COO>) { - // Iterates over the first level of %sp - %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> - %r1 = sparse_tensor.iterate %it1 in %l1 at (%crd0) - : !sparse_tensor.iter_space<#COO, lvls = 0 to 1> { - // Iterates over the second level of %sp - %l2 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 - : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0 to 1> - %r2 = sparse_tensor.iterate %it2 in %l2 at (crd1) - : !sparse_tensor.iter_space<#COO, lvls = 1 to 2> { - vector.print %crd0 : index - vector.print %crd1 : index - } - } - } - - ``` - }]; - - let extraClassDeclaration = [{ - unsigned getSpaceDim() { - return getIterSpace().getType().getSpaceDim(); - } - BlockArgument getIterator() { - return getRegion().getArguments().front(); - } - Block::BlockArgListType getCrds() { - // The first block argument is iterator, the remaining arguments are - // referenced coordinates. - return getRegion().getArguments().slice(1, getCrdUsedLvls().count()); - } - unsigned getNumRegionIterArgs() { - return getRegion().getArguments().size() - 1 - getCrdUsedLvls().count(); - } - }]; - - let hasVerifier = 1; - let hasRegionVerifier = 1; - let hasCustomAssemblyFormat = 1; -} - //===----------------------------------------------------------------------===// // Sparse Tensor Debugging and Test-Only Operations. //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td index 264a0a5b3bee6c..185cff46ae25d5 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td @@ -72,99 +72,4 @@ def SparseTensorStorageSpecifier : Type($_self)">, "metadata", "::mlir::sparse_tensor::StorageSpecifierType">; -//===----------------------------------------------------------------------===// -// Sparse Tensor Iteration Types. -//===----------------------------------------------------------------------===// - -def SparseTensor_IterSpace : SparseTensor_Type<"IterSpace"> { - let mnemonic = "iter_space"; - - let description = [{ - A sparse iteration space that represents an abstract N-D (sparse) iteration space - extracted from a sparse tensor. - - Examples: - - ```mlir - // An iteration space extracted from a CSR tensor between levels [0, 2). - !iter_space<#CSR, lvls = 0 to 2> - ``` - }]; - - let parameters = (ins - SparseTensorEncodingAttr : $encoding, - "Level" : $loLvl, - "Level" : $hiLvl - ); - - let extraClassDeclaration = [{ - /// The the dimension of the iteration space. - unsigned getSpaceDim() const { - return getHiLvl() - getLoLvl(); - } - - /// Get the level types for the iteration space. 
- ArrayRef getLvlTypes() const { - return getEncoding().getLvlTypes().slice(getLoLvl(), getSpaceDim()); - } - - /// Whether the iteration space is unique (i.e., no duplicated coordinate). - bool isUnique() { - return !getLvlTypes().back().isa(); - } - - /// Get the corresponding iterator type. - ::mlir::sparse_tensor::IteratorType getIteratorType() const; - }]; - - let assemblyFormat="`<` $encoding `,` `lvls` `=` custom($loLvl, $hiLvl) `>`"; -} - -def SparseTensor_Iterator : SparseTensor_Type<"Iterator"> { - let mnemonic = "iterator"; - - let description = [{ - An iterator that points to the current element in the corresponding iteration space. - - Examples: - - ```mlir - // An iterator that iterates over a iteration space of type `!iter_space<#CSR, lvls = 0 to 2>` - !iterator<#CSR, lvls = 0 to 2> - ``` - }]; - - let parameters = (ins - SparseTensorEncodingAttr : $encoding, - "Level" : $loLvl, - "Level" : $hiLvl - ); - - let extraClassDeclaration = [{ - /// Get the corresponding iteration space type. - ::mlir::sparse_tensor::IterSpaceType getIterSpaceType() const; - - unsigned getSpaceDim() const { return getIterSpaceType().getSpaceDim(); } - ArrayRef getLvlTypes() const { return getIterSpaceType().getLvlTypes(); } - bool isUnique() { return getIterSpaceType().isUnique(); } - }]; - - let assemblyFormat="`<` $encoding `,` `lvls` `=` custom($loLvl, $hiLvl) `>`"; -} - -def IsSparseSparseIterSpaceTypePred - : CPred<"::llvm::isa<::mlir::sparse_tensor::IterSpaceType>($_self)">; - -def IsSparseSparseIteratorTypePred - : CPred<"::llvm::isa<::mlir::sparse_tensor::IteratorType>($_self)">; - -def AnySparseIterSpace - : Type; - -def AnySparseIterator - : Type; - - #endif // SPARSETENSOR_TYPES diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index 36908def09f403..e9058394d33da5 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -30,14 +30,6 @@ #include "mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.cpp.inc" #include "mlir/Dialect/SparseTensor/IR/SparseTensorAttrEnums.cpp.inc" -// Forward declarations, following custom print/parsing methods are referenced -// by the generated code for SparseTensorTypes.td. -static mlir::ParseResult parseLevelRange(mlir::AsmParser &, - mlir::sparse_tensor::Level &, - mlir::sparse_tensor::Level &); -static void printLevelRange(mlir::AsmPrinter &, mlir::sparse_tensor::Level, - mlir::sparse_tensor::Level); - #define GET_TYPEDEF_CLASSES #include "mlir/Dialect/SparseTensor/IR/SparseTensorTypes.cpp.inc" @@ -1961,363 +1953,6 @@ LogicalResult SortOp::verify() { return success(); } -//===----------------------------------------------------------------------===// -// Sparse Tensor Iteration Operations. 
-//===----------------------------------------------------------------------===// - -IterSpaceType IteratorType::getIterSpaceType() const { - return IterSpaceType::get(getContext(), getEncoding(), getLoLvl(), - getHiLvl()); -} - -IteratorType IterSpaceType::getIteratorType() const { - return IteratorType::get(getContext(), getEncoding(), getLoLvl(), getHiLvl()); -} - -/// Parses a level range in the form "$lo `to` $hi" -/// or simply "$lo" if $hi - $lo = 1 -static ParseResult parseLevelRange(AsmParser &parser, Level &lvlLo, - Level &lvlHi) { - if (parser.parseInteger(lvlLo)) - return failure(); - - if (succeeded(parser.parseOptionalKeyword("to"))) { - if (parser.parseInteger(lvlHi)) - return failure(); - } else { - lvlHi = lvlLo + 1; - } - - if (lvlHi <= lvlLo) - parser.emitError(parser.getNameLoc(), - "expect larger level upper bound than lower bound"); - - return success(); -} - -/// Parses a level range in the form "$lo `to` $hi" -/// or simply "$lo" if $hi - $lo = 1 -static ParseResult parseLevelRange(OpAsmParser &parser, IntegerAttr &lvlLoAttr, - IntegerAttr &lvlHiAttr) { - Level lvlLo, lvlHi; - if (parseLevelRange(parser, lvlLo, lvlHi)) - return failure(); - - lvlLoAttr = IntegerAttr::get(parser.getBuilder().getIndexType(), lvlLo); - lvlHiAttr = IntegerAttr::get(parser.getBuilder().getIndexType(), lvlHi); - return success(); -} - -/// Prints a level range in the form "$lo `to` $hi" -/// or simply "$lo" if $hi - $lo = 1 -static void printLevelRange(AsmPrinter &p, Level lo, Level hi) { - - if (lo + 1 == hi) - p << lo; - else - p << lo << " to " << hi; -} - -/// Prints a level range in the form "$lo `to` $hi" -/// or simply "$lo" if $hi - $lo = 1 -static void printLevelRange(OpAsmPrinter &p, Operation *, IntegerAttr lvlLo, - IntegerAttr lvlHi) { - unsigned lo = lvlLo.getValue().getZExtValue(); - unsigned hi = lvlHi.getValue().getZExtValue(); - printLevelRange(p, lo, hi); -} - -static ParseResult -parseSparseSpaceLoop(OpAsmParser &parser, OperationState &state, - SmallVectorImpl &iterators, - SmallVectorImpl &iterArgs) { - SmallVector spaces; - SmallVector initArgs; - - // Parses "%iters, ... in %spaces, ..." - if (parser.parseArgumentList(iterators) || parser.parseKeyword("in") || - parser.parseOperandList(spaces)) - return failure(); - - if (iterators.size() != spaces.size()) - return parser.emitError( - parser.getNameLoc(), - "mismatch in number of sparse iterators and sparse spaces"); - - // Parse "at(%crd0, _, ...)" - LevelSet crdUsedLvlSet; - bool hasUsedCrds = succeeded(parser.parseOptionalKeyword("at")); - unsigned lvlCrdCnt = 0; - if (hasUsedCrds) { - ParseResult crdList = parser.parseCommaSeparatedList( - OpAsmParser::Delimiter::Paren, [&]() -> ParseResult { - if (parser.parseOptionalKeyword("_")) { - if (parser.parseArgument(iterArgs.emplace_back())) - return failure(); - // Always use IndexType for the coordinate. - crdUsedLvlSet.set(lvlCrdCnt); - iterArgs.back().type = parser.getBuilder().getIndexType(); - } - lvlCrdCnt += 1; - return success(); - }); - if (failed(crdList)) { - return parser.emitError( - parser.getNameLoc(), - "expecting SSA value or \"_\" for level coordinates"); - } - } - // Set the CrdUsedLvl bitset. 
- state.addAttribute("crdUsedLvls", - parser.getBuilder().getI64IntegerAttr(crdUsedLvlSet)); - - // Parse "iter_args(%arg = %init, ...)" - bool hasIterArgs = succeeded(parser.parseOptionalKeyword("iter_args")); - if (hasIterArgs) - if (parser.parseAssignmentList(iterArgs, initArgs)) - return failure(); - - SmallVector iterSpaceTps; - // parse ": sparse_tensor.iter_space -> ret" - if (parser.parseColon() || parser.parseTypeList(iterSpaceTps)) - return failure(); - if (iterSpaceTps.size() != spaces.size()) - return parser.emitError(parser.getNameLoc(), - "mismatch in number of iteration space operands " - "and iteration space types"); - - for (auto [it, tp] : llvm::zip_equal(iterators, iterSpaceTps)) { - IterSpaceType spaceTp = llvm::dyn_cast(tp); - if (!spaceTp) - return parser.emitError(parser.getNameLoc(), - "expected sparse_tensor.iter_space type for " - "iteration space operands"); - if (hasUsedCrds && spaceTp.getSpaceDim() != lvlCrdCnt) - return parser.emitError(parser.getNameLoc(), - "mismatch in number of iteration space dimension " - "and specified coordinates"); - it.type = spaceTp.getIteratorType(); - } - - if (hasIterArgs) - if (parser.parseArrowTypeList(state.types)) - return failure(); - - // Resolves input operands. - if (parser.resolveOperands(spaces, iterSpaceTps, parser.getNameLoc(), - state.operands)) - return failure(); - - if (hasIterArgs) { - unsigned numCrds = crdUsedLvlSet.count(); - // Strip off leading args that used for coordinates. - MutableArrayRef args = MutableArrayRef(iterArgs).drop_front(numCrds); - if (args.size() != initArgs.size() || args.size() != state.types.size()) { - return parser.emitError( - parser.getNameLoc(), - "mismatch in number of iteration arguments and return values"); - } - - for (auto [it, init, tp] : llvm::zip_equal(args, initArgs, state.types)) { - it.type = tp; - if (parser.resolveOperand(init, tp, state.operands)) - return failure(); - } - } - return success(); -} - -LogicalResult ExtractIterSpaceOp::inferReturnTypes( - MLIRContext *ctx, std::optional loc, ValueRange ops, - DictionaryAttr attr, OpaqueProperties prop, RegionRange region, - SmallVectorImpl &ret) { - - ExtractIterSpaceOp::Adaptor adaptor(ops, attr, prop, region); - SparseTensorType stt = getSparseTensorType(adaptor.getTensor()); - ret.push_back(IterSpaceType::get(ctx, stt.getEncoding(), adaptor.getLoLvl(), - adaptor.getHiLvl())); - return success(); -} - -LogicalResult ExtractIterSpaceOp::verify() { - if (getLoLvl() >= getHiLvl()) - return emitOpError("expected smaller level low than level high"); - - TypedValue pIter = getParentIter(); - if ((pIter && getLoLvl() == 0) || (!pIter && getLoLvl() != 0)) { - return emitOpError( - "parent iterator should be specified iff level lower bound equals 0"); - } - - if (pIter) { - IterSpaceType spaceTp = getResultSpace().getType(); - if (pIter.getType().getEncoding() != spaceTp.getEncoding()) - return emitOpError( - "mismatch in parent iterator encoding and iteration space encoding."); - - if (spaceTp.getLoLvl() != pIter.getType().getHiLvl()) - return emitOpError("parent iterator should be used to extract an " - "iteration space from a consecutive level."); - } - - return success(); -} - -ParseResult IterateOp::parse(OpAsmParser &parser, OperationState &result) { - OpAsmParser::Argument iterator; - OpAsmParser::UnresolvedOperand iterSpace; - - SmallVector iters, iterArgs; - if (parseSparseSpaceLoop(parser, result, iters, iterArgs)) - return failure(); - if (iters.size() != 1) - return parser.emitError(parser.getNameLoc(), - "expected 
only one iterator/iteration space"); - - iters.append(iterArgs); - Region *body = result.addRegion(); - if (parser.parseRegion(*body, iters)) - return failure(); - - IterateOp::ensureTerminator(*body, parser.getBuilder(), result.location); - - // Parse the optional attribute list. - if (parser.parseOptionalAttrDict(result.attributes)) - return failure(); - - return success(); -} - -/// Prints the initialization list in the form of -/// (%inner = %outer, %inner2 = %outer2, <...>) -/// where 'inner' values are assumed to be region arguments and 'outer' values -/// are regular SSA values. -static void printInitializationList(OpAsmPrinter &p, - Block::BlockArgListType blocksArgs, - ValueRange initializers, - StringRef prefix = "") { - assert(blocksArgs.size() == initializers.size() && - "expected same length of arguments and initializers"); - if (initializers.empty()) - return; - - p << prefix << '('; - llvm::interleaveComma(llvm::zip(blocksArgs, initializers), p, [&](auto it) { - p << std::get<0>(it) << " = " << std::get<1>(it); - }); - p << ")"; -} - -static void printUsedCrdsList(OpAsmPrinter &p, unsigned spaceDim, - Block::BlockArgListType blocksArgs, - LevelSet crdUsedLvls) { - if (crdUsedLvls.empty()) - return; - - p << " at("; - for (unsigned i = 0; i < spaceDim; i++) { - if (crdUsedLvls[i]) { - p << blocksArgs.front(); - blocksArgs = blocksArgs.drop_front(); - } else { - p << "_"; - } - if (i != spaceDim - 1) - p << ", "; - } - assert(blocksArgs.empty()); - p << ")"; -} - -void IterateOp::print(OpAsmPrinter &p) { - p << " " << getIterator() << " in " << getIterSpace(); - printUsedCrdsList(p, getSpaceDim(), getCrds(), getCrdUsedLvls()); - printInitializationList(p, getRegionIterArgs(), getInitArgs(), " iter_args"); - - p << " : " << getIterSpace().getType() << " "; - if (!getInitArgs().empty()) - p << "-> (" << getInitArgs().getTypes() << ") "; - - p.printRegion(getRegion(), /*printEntryBlockArgs=*/false, - /*printBlockTerminators=*/!getInitArgs().empty()); -} - -LogicalResult IterateOp::verify() { - if (getInitArgs().size() != getNumResults()) { - return emitOpError( - "mismatch in number of loop-carried values and defined values"); - } - return success(); -} - -LogicalResult IterateOp::verifyRegions() { - if (getIterator().getType() != getIterSpace().getType().getIteratorType()) - return emitOpError("mismatch in iterator and iteration space type"); - if (getNumRegionIterArgs() != getNumResults()) - return emitOpError( - "mismatch in number of basic block args and defined values"); - - auto initArgs = getInitArgs(); - auto iterArgs = getRegionIterArgs(); - auto yieldVals = getYieldedValues(); - auto opResults = getResults(); - if (!llvm::all_equal({initArgs.size(), iterArgs.size(), yieldVals.size(), - opResults.size()})) { - return emitOpError() << "number mismatch between iter args and results."; - } - - for (auto [i, init, iter, yield, ret] : - llvm::enumerate(initArgs, iterArgs, yieldVals, opResults)) { - if (init.getType() != ret.getType()) - return emitOpError() << "types mismatch between " << i - << "th iter operand and defined value"; - if (iter.getType() != ret.getType()) - return emitOpError() << "types mismatch between " << i - << "th iter region arg and defined value"; - if (yield.getType() != ret.getType()) - return emitOpError() << "types mismatch between " << i - << "th yield value and defined value"; - } - - return success(); -} - -/// IterateOp implemented OpInterfaces' methods. 
-SmallVector IterateOp::getLoopRegions() { return {&getRegion()}; } - -MutableArrayRef IterateOp::getInitsMutable() { - return getInitArgsMutable(); -} - -Block::BlockArgListType IterateOp::getRegionIterArgs() { - return getRegion().getArguments().take_back(getNumRegionIterArgs()); -} - -std::optional> IterateOp::getYieldedValuesMutable() { - return cast( - getRegion().getBlocks().front().getTerminator()) - .getResultsMutable(); -} - -std::optional IterateOp::getLoopResults() { return getResults(); } - -OperandRange IterateOp::getEntrySuccessorOperands(RegionBranchPoint point) { - return getInitArgs(); -} - -void IterateOp::getSuccessorRegions(RegionBranchPoint point, - SmallVectorImpl ®ions) { - // Both the operation itself and the region may be branching into the body or - // back into the operation itself. - regions.push_back(RegionSuccessor(&getRegion(), getRegionIterArgs())); - // It is possible for loop not to enter the body. - regions.push_back(RegionSuccessor(getResults())); -} - -//===----------------------------------------------------------------------===// -// Sparse Tensor Dialect Setups. -//===----------------------------------------------------------------------===// - /// Materialize a single constant operation from a given attribute value with /// the desired resultant type. Operation *SparseTensorDialect::materializeConstant(OpBuilder &builder, diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir index b13024cd4ed99d..7f5c05190fc9a2 100644 --- a/mlir/test/Dialect/SparseTensor/invalid.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid.mlir @@ -1012,142 +1012,3 @@ func.func @sparse_print(%arg0: tensor<10x10xf64>) { sparse_tensor.print %arg0 : tensor<10x10xf64> return } - -// ----- - -#COO = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : compressed(nonunique), - j : singleton(soa) - ) -}> - -func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 2>) { - // expected-error@+1 {{'sparse_tensor.extract_iteration_space' expect larger level upper bound than lower bound}} - %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 2 to 0 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 2> - return -} - -// ----- - -#COO = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : compressed(nonunique), - j : singleton(soa) - ) -}> - -func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) { - // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be specified iff level lower bound equals 0}} - %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 0 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> - return -} - -// ----- - -#COO = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : compressed(nonunique), - j : singleton(soa) - ) -}> - -func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>) { - // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be specified iff level lower bound equals 0}} - %l1 = sparse_tensor.extract_iteration_space %sp lvls = 1 : tensor<4x8xf32, #COO> - return -} - -// ----- - -#COO = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : compressed(nonunique), - j : singleton(soa) - ) -}> - -#CSR = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : dense, - j : compressed - ) -}> - -func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#CSR, 
lvls = 0>) { - // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op mismatch in parent iterator encoding and iteration space encoding.}} - %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#CSR, lvls = 0> - return -} - -// ----- - -#COO = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : compressed(nonunique), - j : singleton(soa) - ) -}> - -func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) { - // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be used to extract an iteration space from a consecutive level.}} - %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 2 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> - return -} - - -// ----- - -#COO = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : compressed(nonunique), - j : singleton(soa) - ) -}> - -func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -> index { - %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> - // expected-error @+1 {{'sparse_tensor.iterate' op different number of region iter_args and yielded values: 2 != 1}} - %r1, %r2 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%si = %i, %sj = %j): !sparse_tensor.iter_space<#COO, lvls = 0> -> (index, index) { - sparse_tensor.yield %si : index - } - return %r1 : index -} - -// ----- - -#COO = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : compressed(nonunique), - j : singleton(soa) - ) -}> - -// expected-note@+1 {{prior use here}} -func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index) -> f32 { - %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> - // expected-error @+1 {{use of value '%i' expects different type than prior uses: 'f32' vs 'index'}} - %r1 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%outer = %i): !sparse_tensor.iter_space<#COO, lvls = 0> -> f32 { - sparse_tensor.yield %outer : f32 - } - return %r1 : f32 -} - -// ----- - -#COO = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : compressed(nonunique), - j : singleton(soa) - ) -}> - -func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -> index { - %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> - // expected-error @+1 {{'sparse_tensor.iterate' op 0-th region iter_arg and 0-th yielded value have different type: 'index' != 'f32'}} - %r1 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%si = %i): !sparse_tensor.iter_space<#COO, lvls = 0> -> index { - %y = arith.constant 1.0 : f32 - sparse_tensor.yield %y : f32 - } - return %r1 : index -} diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir index e9a898f16b41d2..12f69c1d37b9cd 100644 --- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir @@ -738,56 +738,3 @@ func.func @sparse_has_runtime() -> i1 { %has_runtime = sparse_tensor.has_runtime_library return %has_runtime : i1 } - -// ----- - -#COO = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : compressed(nonunique), - j : singleton(soa) - ) -}> - -// CHECK-LABEL: func.func @sparse_extract_iter_space( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32, #sparse{{[0-9]*}}>, -// CHECK-SAME: %[[VAL_1:.*]]: !sparse_tensor.iterator<#sparse{{[0-9]*}}, lvls = 0>) -// CHECK: %[[VAL_2:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] 
lvls = 0 -// CHECK: %[[VAL_3:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] at %[[VAL_1]] lvls = 1 -// CHECK: return %[[VAL_2]], %[[VAL_3]] : !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0>, !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 1> -// CHECK: } -func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) - -> (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>) { - // Extracting the iteration space for the first level needs no parent iterator. - %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> - // Extracting the iteration space for the second level needs a parent iterator. - %l2 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> - return %l1, %l2 : !sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1> -} - - -// ----- - -#COO = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : compressed(nonunique), - j : singleton(soa) - ) -}> - -// CHECK-LABEL: func.func @sparse_iterate( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32, #sparse{{[0-9]*}}>, -// CHECK-SAME: %[[VAL_1:.*]]: index, -// CHECK-SAME: %[[VAL_2:.*]]: index) -> index { -// CHECK: %[[VAL_3:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] lvls = 0 : tensor<4x8xf32, #sparse{{[0-9]*}}> -// CHECK: %[[VAL_4:.*]] = sparse_tensor.iterate %[[VAL_5:.*]] in %[[VAL_3]] at(%[[VAL_6:.*]]) iter_args(%[[VAL_7:.*]] = %[[VAL_1]]) : !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0> -> (index) { -// CHECK: sparse_tensor.yield %[[VAL_7]] : index -// CHECK: } -// CHECK: return %[[VAL_4]] : index -// CHECK: } -func.func @sparse_iterate(%sp : tensor<4x8xf32, #COO>, %i : index, %j : index) -> index { - %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> - %r1 = sparse_tensor.iterate %it1 in %l1 at (%crd) iter_args(%outer = %i): !sparse_tensor.iter_space<#COO, lvls = 0 to 1> -> index { - sparse_tensor.yield %outer : index - } - return %r1 : index -} diff --git a/mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir b/mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir deleted file mode 100644 index e7158d04b37feb..00000000000000 --- a/mlir/test/Dialect/SparseTensor/sparse_itertion_licm.mlir +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: mlir-opt %s --loop-invariant-code-motion | FileCheck %s - -#CSR = #sparse_tensor.encoding<{ - map = (i, j) -> ( - i : dense, - j : compressed - ) -}> - -// Make sure that pure instructions are hoisted outside the loop. -// -// CHECK: sparse_tensor.values -// CHECK: sparse_tensor.positions -// CHECK: sparse_tensor.coordinate -// CHECK: sparse_tensor.iterate -func.func @sparse_iterate(%sp : tensor) { - %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor - sparse_tensor.iterate %it1 in %l1 at (%crd) : !sparse_tensor.iter_space<#CSR, lvls = 0> { - %0 = sparse_tensor.values %sp : tensor to memref - %1 = sparse_tensor.positions %sp { level = 1 : index } : tensor to memref - %2 = sparse_tensor.coordinates %sp { level = 1 : index } : tensor to memref - "test.op"(%0, %1, %2) : (memref, memref, memref) -> () - } - - return -} From 481bd5d416df7a1d24e18cc81ae782e8701de965 Mon Sep 17 00:00:00 2001 From: Peiming Liu Date: Tue, 16 Apr 2024 11:32:30 -0700 Subject: [PATCH 148/300] [mlir][sparse] introduce `sparse_tensor.extract_iteration_space` operation. 
(#88554)

A `sparse_tensor.extract_iteration_space %tensor at %iterator` extracts a
*sparse* iteration space defined by `%tensor`; the operations to traverse the
iteration space will be introduced in follow-up PRs.
---
 .../SparseTensor/IR/SparseTensorOps.td        |  60 ++++++++++
 .../SparseTensor/IR/SparseTensorTypes.td      |  97 +++++++++++++++
 .../SparseTensor/IR/SparseTensorDialect.cpp   | 110 ++++++++++++++++++
 mlir/test/Dialect/SparseTensor/invalid.mlir   |  82 +++++++++++++
 mlir/test/Dialect/SparseTensor/roundtrip.mlir |  25 ++++
 5 files changed, 374 insertions(+)

diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index 0cfc64f9988a0a..d7121e8320a4bc 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -1430,6 +1430,66 @@ def SparseTensor_ForeachOp : SparseTensor_Op<"foreach",
   let hasVerifier = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// Sparse Tensor Iteration Operations.
+//===----------------------------------------------------------------------===//
+
+def ExtractIterSpaceOp : SparseTensor_Op<"extract_iteration_space",
+    [Pure, DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+
+  let arguments = (ins AnySparseTensor:$tensor,
+                       Optional<AnySparseIterator>:$parentIter,
+                       LevelAttr:$loLvl, LevelAttr:$hiLvl);
+
+  let results = (outs AnySparseIterSpace:$resultSpace);
+
+  let summary = "Extracts an iteration space from a sparse tensor between certain levels";
+  let description = [{
+    Extracts a `!sparse_tensor.iter_space` from a sparse tensor between
+    certain (consecutive) levels. For sparse levels, it is usually done by
+    loading a position range from the underlying sparse tensor storage.
+    E.g., for a compressed level, the iteration space is extracted as
+    [pos[i], pos[i+1]), supposing the parent iterator points at `i`.
+
+    `tensor`: the input sparse tensor that defines the iteration space.
+    `parentIter`: the iterator for the previous level, at which the iteration space
+    at the current levels will be extracted.
+    `loLvl`, `hiLvl`: the level range [loLvl, hiLvl) in the input tensor that
+    the returned iteration space covers. `hiLvl - loLvl` defines the dimension of the
+    iteration space.
+
+    The type of the returned value is automatically inferred as
+    `!sparse_tensor.iter_space<#INPUT_ENCODING, lvls = $loLvl to $hiLvl>`.
+    The returned iteration space can then be iterated over by
+    `sparse_tensor.iterate` operations to visit every stored element
+    (usually nonzeros) in the input sparse tensor.
+
+    Example:
+    ```mlir
+    // Extracts a 1-D iteration space from a COO tensor at level 1.
+    %space = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1
+      : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0>
+    ```
+  }];
+
+
+  let extraClassDeclaration = [{
+    std::pair<Level, Level> getLvlRange() {
+      return std::make_pair(getLoLvl(), getHiLvl());
+    }
+    unsigned getSpaceDim() {
+      return getHiLvl() - getLoLvl();
+    }
+    ArrayRef<::mlir::sparse_tensor::LevelType> getSpaceLvlTypes() {
+      return getResultSpace().getType().getLvlTypes();
+    }
+  }];
+
+  let hasVerifier = 1;
+  let assemblyFormat = "$tensor (`at` $parentIter^)? `lvls` `=` custom<LevelRange>($loLvl, $hiLvl) "
+                       " attr-dict `:` type($tensor) (`,` type($parentIter)^)?";
+}
+
+//===----------------------------------------------------------------------===//
+// Sparse Tensor Debugging and Test-Only Operations.
//===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td index 185cff46ae25d5..79113d8778743c 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td @@ -72,4 +72,101 @@ def SparseTensorStorageSpecifier : Type($_self)">, "metadata", "::mlir::sparse_tensor::StorageSpecifierType">; +//===----------------------------------------------------------------------===// +// Sparse Tensor Iteration Types. +//===----------------------------------------------------------------------===// + +def SparseTensor_IterSpace : SparseTensor_Type<"IterSpace"> { + let mnemonic = "iter_space"; + + let description = [{ + A sparse iteration space that represents an abstract N-D (sparse) iteration space + extracted from a sparse tensor, i.e., a set of (crd_0, crd_1, ..., crd_N) for + every stored element (usually nonzeros) in a sparse tensor between the specified + [$loLvl, $hiLvl) levels. + + Examples: + + ```mlir + // An iteration space extracted from a CSR tensor between levels [0, 2). + !iter_space<#CSR, lvls = 0 to 2> + ``` + }]; + + let parameters = (ins + SparseTensorEncodingAttr : $encoding, + "Level" : $loLvl, + "Level" : $hiLvl + ); + + let extraClassDeclaration = [{ + /// The the dimension of the iteration space. + unsigned getSpaceDim() const { + return getHiLvl() - getLoLvl(); + } + + /// Get the level types for the iteration space. + ArrayRef getLvlTypes() const { + return getEncoding().getLvlTypes().slice(getLoLvl(), getSpaceDim()); + } + + /// Whether the iteration space is unique (i.e., no duplicated coordinate). + bool isUnique() { + return !getLvlTypes().back().isa(); + } + + /// Get the corresponding iterator type. + ::mlir::sparse_tensor::IteratorType getIteratorType() const; + }]; + + let assemblyFormat="`<` $encoding `,` `lvls` `=` custom($loLvl, $hiLvl) `>`"; +} + +def SparseTensor_Iterator : SparseTensor_Type<"Iterator"> { + let mnemonic = "iterator"; + + let description = [{ + An iterator that points to the current element in the corresponding iteration space. + + Examples: + + ```mlir + // An iterator that iterates over a iteration space of type `!iter_space<#CSR, lvls = 0 to 2>` + !iterator<#CSR, lvls = 0 to 2> + ``` + }]; + + let parameters = (ins + SparseTensorEncodingAttr : $encoding, + "Level" : $loLvl, + "Level" : $hiLvl + ); + + let extraClassDeclaration = [{ + /// Get the corresponding iteration space type. 
+ ::mlir::sparse_tensor::IterSpaceType getIterSpaceType() const; + + unsigned getSpaceDim() const { return getIterSpaceType().getSpaceDim(); } + ArrayRef getLvlTypes() const { return getIterSpaceType().getLvlTypes(); } + bool isUnique() { return getIterSpaceType().isUnique(); } + }]; + + let assemblyFormat="`<` $encoding `,` `lvls` `=` custom($loLvl, $hiLvl) `>`"; +} + +def IsSparseSparseIterSpaceTypePred + : CPred<"::llvm::isa<::mlir::sparse_tensor::IterSpaceType>($_self)">; + +def IsSparseSparseIteratorTypePred + : CPred<"::llvm::isa<::mlir::sparse_tensor::IteratorType>($_self)">; + +def AnySparseIterSpace + : Type; + +def AnySparseIterator + : Type; + + #endif // SPARSETENSOR_TYPES diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index e9058394d33da5..516b0943bdcfac 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -30,6 +30,14 @@ #include "mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.cpp.inc" #include "mlir/Dialect/SparseTensor/IR/SparseTensorAttrEnums.cpp.inc" +// Forward declarations, following custom print/parsing methods are referenced +// by the generated code for SparseTensorTypes.td. +static mlir::ParseResult parseLevelRange(mlir::AsmParser &, + mlir::sparse_tensor::Level &, + mlir::sparse_tensor::Level &); +static void printLevelRange(mlir::AsmPrinter &, mlir::sparse_tensor::Level, + mlir::sparse_tensor::Level); + #define GET_TYPEDEF_CLASSES #include "mlir/Dialect/SparseTensor/IR/SparseTensorTypes.cpp.inc" @@ -1953,6 +1961,108 @@ LogicalResult SortOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// Sparse Tensor Iteration Operations. 
+//===----------------------------------------------------------------------===// + +IterSpaceType IteratorType::getIterSpaceType() const { + return IterSpaceType::get(getContext(), getEncoding(), getLoLvl(), + getHiLvl()); +} + +IteratorType IterSpaceType::getIteratorType() const { + return IteratorType::get(getContext(), getEncoding(), getLoLvl(), getHiLvl()); +} + +/// Parses a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static ParseResult parseLevelRange(AsmParser &parser, Level &lvlLo, + Level &lvlHi) { + if (parser.parseInteger(lvlLo)) + return failure(); + + if (succeeded(parser.parseOptionalKeyword("to"))) { + if (parser.parseInteger(lvlHi)) + return failure(); + } else { + lvlHi = lvlLo + 1; + } + + if (lvlHi <= lvlLo) + parser.emitError(parser.getNameLoc(), + "expect larger level upper bound than lower bound"); + + return success(); +} + +/// Parses a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static ParseResult parseLevelRange(OpAsmParser &parser, IntegerAttr &lvlLoAttr, + IntegerAttr &lvlHiAttr) { + Level lvlLo, lvlHi; + if (parseLevelRange(parser, lvlLo, lvlHi)) + return failure(); + + lvlLoAttr = IntegerAttr::get(parser.getBuilder().getIndexType(), lvlLo); + lvlHiAttr = IntegerAttr::get(parser.getBuilder().getIndexType(), lvlHi); + return success(); +} + +/// Prints a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static void printLevelRange(AsmPrinter &p, Level lo, Level hi) { + + if (lo + 1 == hi) + p << lo; + else + p << lo << " to " << hi; +} + +/// Prints a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static void printLevelRange(OpAsmPrinter &p, Operation *, IntegerAttr lvlLo, + IntegerAttr lvlHi) { + unsigned lo = lvlLo.getValue().getZExtValue(); + unsigned hi = lvlHi.getValue().getZExtValue(); + printLevelRange(p, lo, hi); +} + +LogicalResult ExtractIterSpaceOp::inferReturnTypes( + MLIRContext *ctx, std::optional loc, ValueRange ops, + DictionaryAttr attr, OpaqueProperties prop, RegionRange region, + SmallVectorImpl &ret) { + + ExtractIterSpaceOp::Adaptor adaptor(ops, attr, prop, region); + SparseTensorType stt = getSparseTensorType(adaptor.getTensor()); + ret.push_back(IterSpaceType::get(ctx, stt.getEncoding(), adaptor.getLoLvl(), + adaptor.getHiLvl())); + return success(); +} + +LogicalResult ExtractIterSpaceOp::verify() { + if (getLoLvl() >= getHiLvl()) + return emitOpError("expected smaller level low than level high"); + + TypedValue pIter = getParentIter(); + if ((pIter && getLoLvl() == 0) || (!pIter && getLoLvl() != 0)) { + return emitOpError( + "parent iterator should be specified iff level lower bound equals 0"); + } + + if (pIter) { + IterSpaceType spaceTp = getResultSpace().getType(); + if (pIter.getType().getEncoding() != spaceTp.getEncoding()) + return emitOpError( + "mismatch in parent iterator encoding and iteration space encoding."); + + if (spaceTp.getLoLvl() != pIter.getType().getHiLvl()) + return emitOpError("parent iterator should be used to extract an " + "iteration space from a consecutive level."); + } + + return success(); +} + /// Materialize a single constant operation from a given attribute value with /// the desired resultant type. 
Operation *SparseTensorDialect::materializeConstant(OpBuilder &builder, diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir index 7f5c05190fc9a2..3fa696e1600a93 100644 --- a/mlir/test/Dialect/SparseTensor/invalid.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid.mlir @@ -1012,3 +1012,85 @@ func.func @sparse_print(%arg0: tensor<10x10xf64>) { sparse_tensor.print %arg0 : tensor<10x10xf64> return } + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 2>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' expect larger level upper bound than lower bound}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 2 to 0 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 2> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be specified iff level lower bound equals 0}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 0 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be specified iff level lower bound equals 0}} + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 1 : tensor<4x8xf32, #COO> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +#CSR = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : dense, + j : compressed + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#CSR, lvls = 0>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op mismatch in parent iterator encoding and iteration space encoding.}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#CSR, lvls = 0> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be used to extract an iteration space from a consecutive level.}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 2 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> + return +} diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir index 12f69c1d37b9cd..d34071279e5129 100644 --- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir @@ -738,3 +738,28 @@ func.func @sparse_has_runtime() -> i1 { %has_runtime = sparse_tensor.has_runtime_library return %has_runtime : i1 } + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : 
compressed(nonunique), + j : singleton(soa) + ) +}> + +// CHECK-LABEL: func.func @sparse_extract_iter_space( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32, #sparse{{[0-9]*}}>, +// CHECK-SAME: %[[VAL_1:.*]]: !sparse_tensor.iterator<#sparse{{[0-9]*}}, lvls = 0>) +// CHECK: %[[VAL_2:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] lvls = 0 +// CHECK: %[[VAL_3:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] at %[[VAL_1]] lvls = 1 +// CHECK: return %[[VAL_2]], %[[VAL_3]] : !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0>, !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 1> +// CHECK: } +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) + -> (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>) { + // Extracting the iteration space for the first level needs no parent iterator. + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> + // Extracting the iteration space for the second level needs a parent iterator. + %l2 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> + return %l1, %l2 : !sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1> +} From edb711d2f318b17489692b5f85028fab7ed85b83 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 16 Apr 2024 13:35:23 -0500 Subject: [PATCH 149/300] [InstCombine] Update `vector_reduce_and` tests to actually use `llvm.vector.reduce.and`; NFC --- llvm/test/Transforms/InstCombine/known-bits.ll | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index 85a21332b07889..8b4249b2c25a92 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -1223,7 +1223,7 @@ define i8 @known_reduce_and(<2 x i8> %xx) { ; CHECK-NEXT: ret i8 1 ; %x = or <2 x i8> %xx, - %v = call i8 @llvm.vector.reduce.or(<2 x i8> %x) + %v = call i8 @llvm.vector.reduce.and(<2 x i8> %x) %r = and i8 %v, 1 ret i8 %r } @@ -1231,12 +1231,12 @@ define i8 @known_reduce_and(<2 x i8> %xx) { define i8 @known_reduce_and_fail(<2 x i8> %xx) { ; CHECK-LABEL: @known_reduce_and_fail( ; CHECK-NEXT: [[X:%.*]] = or <2 x i8> [[XX:%.*]], -; CHECK-NEXT: [[V:%.*]] = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> [[X]]) +; CHECK-NEXT: [[V:%.*]] = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> [[X]]) ; CHECK-NEXT: [[R:%.*]] = and i8 [[V]], 2 ; CHECK-NEXT: ret i8 [[R]] ; %x = or <2 x i8> %xx, - %v = call i8 @llvm.vector.reduce.or(<2 x i8> %x) + %v = call i8 @llvm.vector.reduce.and(<2 x i8> %x) %r = and i8 %v, 2 ret i8 %r } From 5c6af605b307213453a9a043532b9293db21b5c6 Mon Sep 17 00:00:00 2001 From: mahtohappy Date: Wed, 17 Apr 2024 00:12:14 +0530 Subject: [PATCH 150/300] [Clang][Sema] placement new initializes typedef array with correct size (#88902) Build Failure Fix Fixes build failures due to #83124 --- .../{instantiate-new-placement-size.cpp => PR41441.cpp} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename clang/test/SemaCXX/{instantiate-new-placement-size.cpp => PR41441.cpp} (75%) diff --git a/clang/test/SemaCXX/instantiate-new-placement-size.cpp b/clang/test/SemaCXX/PR41441.cpp similarity index 75% rename from clang/test/SemaCXX/instantiate-new-placement-size.cpp rename to clang/test/SemaCXX/PR41441.cpp index 7a29d3dee8491e..0b012b33fce343 100644 --- 
a/clang/test/SemaCXX/instantiate-new-placement-size.cpp
+++ b/clang/test/SemaCXX/PR41441.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang -S -fno-discard-value-names -emit-llvm -o - %s | FileCheck %s
-// Issue no: 41441
+// RUN: %clang --target=x86_64-pc-linux -S -fno-discard-value-names -emit-llvm -o - %s | FileCheck %s
+
 #include <new>
 
 // CHECK: call void @llvm.memset.p0.i64(ptr align 1 %x, i8 0, i64 8, i1 false)

From b01879ec1ffbd249f9bf3c4f32308443be6ac36b Mon Sep 17 00:00:00 2001
From: Chao Chen <116223022+chencha3@users.noreply.github.com>
Date: Tue, 16 Apr 2024 13:44:14 -0500
Subject: [PATCH 151/300] [MLIR][XeGPU] Add XeGPU scattered ops (#86594)

- Extended TensorDescAttr with scattered attribute
- Add scattered ops: CreateDescOp, PrefetchOp, LoadGatherOp,
  StoreScatterOp, UpdateOffsetOp
- Add a block op: UpdateNdOffsetOp

---------

Co-authored-by: Mehdi Amini
Co-authored-by: Adam Siemieniuk
---
 mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h    |   1 +
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  42 +-
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 464 +++++++++++++++---
 .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td       |  41 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    |  21 +
 mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp        | 254 +++++++++-
 mlir/test/Dialect/XeGPU/XeGPUOps.mlir         |  62 +++
 mlir/test/Dialect/XeGPU/invalid.mlir          | 159 ++++++
 8 files changed, 937 insertions(+), 107 deletions(-)
 create mode 100644 mlir/test/Dialect/XeGPU/invalid.mlir

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 87aabdc015fea5..eca9255ff3974b 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -12,6 +12,7 @@
 #include "mlir/Bytecode/BytecodeOpInterface.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
+#include "mlir/IR/TypeUtilities.h"
 #include "mlir/Interfaces/ShapedOpInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index cd38549f1ccf43..6579d07ec26215 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -19,17 +19,36 @@ class XeGPUAttr traits = [],
 }
 
 def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+  let summary = [{a composite attribute for `TensorDescType`}];
+  let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite
+    attribute defined for `TensorDescType` for describing the following
+    properties of a `TensorDesc`.
+    1. `memory_scope`: It describes where the data block described by the
+       TensorDesc is located, `Global` device memory or `Shared` local memory.
+       It defaults to `Global`.
+    2. `array_length`: It describes how many horizontally consecutive blocks
+       will be loaded by a hardware load instruction. If the TensorDesc shape
+       is 8x16, with array_length = 2, the loaded block shape will actually be
+       8x32. Its default value is 1.
+    3. `boundary_check`: It is used to indicate whether the hardware should do
+       an out-of-boundary check. The default value is true.
+    4. `scattered`: It is used to differentiate TensorDescs created by
+       `create_nd_tdesc` from those created by `create_tdesc`.
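A sketch of how such an attribute could look when attached to a `TensorDescType` in IR; the parameter spelling below is an assumption based on the declarative attribute format, and parameters left at their defaults would normally be elided:

```mlir
// A 2D block descriptor in shared local memory that loads two
// horizontally consecutive 8x16 blocks per hardware load instruction.
!xegpu.tensor_desc<8x16xf16, #xegpu.tdesc_attr<memory_scope = slm, array_length = 2 : i64>>
```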
+ }]; + let parameters = (ins OptionalParameter<"MemoryScopeAttr">: $memory_scope, OptionalParameter<"IntegerAttr", "1">: $array_length, - OptionalParameter<"BoolAttr", "true">: $boundary_check + OptionalParameter<"BoolAttr", "true">: $boundary_check, + OptionalParameter<"BoolAttr", "false">: $scattered ); let builders = [ AttrBuilder<(ins CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, CArg<"int", "1">:$array_length, - CArg<"bool", "true">: $boundary_check + CArg<"bool", "true">: $boundary_check, + CArg<"bool", "false">: $scattered )> ]; @@ -41,15 +60,17 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { //===----------------------------------------------------------------------===// def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">; def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">; -def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope", - "The address space of the memory the tensor descritor is created for", +def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope", + "The address space of the memory the tensor descritor is created for", [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::xegpu"; } -def XeGPU_MemoryScopeAttr: +def XeGPU_MemoryScopeAttr: EnumAttr { + let summary = [{Describe the location of data described by a `TensorDesc`: + Global device memory (`Global`) or Shared local memory (`SLM`).}]; let assemblyFormat = "$value"; } @@ -63,19 +84,18 @@ def XeGPU_CachePolicyInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_ def XeGPU_CachePolicyWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only def XeGPU_CachePolicyWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only -def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy", - [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached, +def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy", + [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached, XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid, XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::xegpu"; } -def XeGPU_CacheHintAttr +def XeGPU_CacheHintAttr : EnumAttr { + let summary = [{Describe the cache settings for prefetch/load/store operators}]; let assemblyFormat = "`<` $value `>`"; } - - -#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD +#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD \ No newline at end of file diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index b8ebd1a40c6073..c6f7f83441b96c 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -47,36 +47,35 @@ class XeGPU_Op traits = []>: } -def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface, +def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface, AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> { let summary = "Create nd-tensor descriptor operation"; let description = [{ The "create_nd_tdesc" operation creates a TensorDescType which represents a sub-view of a 2D memory region (It can be extended to support n-D memory - region if needed in future). Elements in the subview continuous in each - dimention. It encodes the following important information for supporting + region if needed in future). Elements in the subview continuous in each + dimension. 
It encodes the following important information for supporting Intel hardware features: - * source: an object representing (starting address/pointer of) a 2D memory region. + * source: an object representing (starting address/pointer of) a 2D memory region. It can be either a 2D memref object, or simply a pointer represented by uint64_t type. - for the later case, the shape and layout information of the 2D memory region should - be explicitly passed via `dynamic_shape` and `dynamic_strides` parameters. - * offsets: two index values represents offsets from the "source" at the each dimension + for the later case, the shape and layout information of the 2D memory region should + be explicitly passed via `shape` and `strides` parameters. + * offsets: two index values represents offsets from the "source" at the each dimension at which the subview of the target memory will be created. It is encoded via two - variables, including "dynamic_offsets" and "static_offsets", such that it can - accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])). - * shape: the shape information of the memory region pointed by the "source". It is - typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. - But if "source" is simply a pointer represented as uint64_t type, or a memref - type without shape information e.g., memref, the shape information has - to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape" - only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]). - * strides: the strides of the memory region pointed by the "source". Similar to shape, - it is typically encoded via the MemRefType of the source too. But if "source" is - simply a pointer represented as uint64_t type, or a memref type without shape - information e.g., memref, the strides information has to be explicitly - passed via the "dynamic_strides" argument. And it currently only accepts operands two. + variables, including "offsets" and "const_offsets", such that it can + accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]). + * shape: the shape information of the memory region pointed by the "source". It is + typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. + But if "source" is simply a pointer represented as uint64_t type, or a memref + type without shape information e.g., memref, the shape information has + to be explicitly passed via the "shape" and "const_shape" arguments. + * strides: the strides of the memory region pointed by the "source". Similar to shape, + it is typically encoded via the MemRefType of the source too. But if "source" is + simply a pointer represented as uint64_t type, or a memref type without shape + information e.g., memref, the strides information has to be explicitly + passed via the "strides" and "const_strides" argument. 
Example 1 (suppose the tensor shape inferred by the compiler is 8x16): %0 = memref.alloc() : memref<1024x1024xf32> @@ -97,10 +96,10 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32> }]; - let arguments = (ins - XeGPU_BaseAddrType: $source, - Variadic: $offsets, - Variadic: $shape, + let arguments = (ins + XeGPU_BaseAddrType: $source, + Variadic: $offsets, + Variadic: $shape, Variadic: $strides, DenseI64ArrayAttr: $const_offsets, OptionalAttr: $const_shape, @@ -119,12 +118,12 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface let hasVerifier = 1; let builders = [ - OpBuilder<(ins "Type": $tdesc, "TypedValue": $source, + OpBuilder<(ins "Type": $tdesc, "TypedValue": $source, "llvm::ArrayRef": $offsets)>, - OpBuilder<(ins "Type": $tdesc, "TypedValue ": $source, + OpBuilder<(ins "Type": $tdesc, "TypedValue ": $source, "llvm::ArrayRef": $offsets, - "llvm::ArrayRef": $shape, + "llvm::ArrayRef": $shape, "llvm::ArrayRef": $strides)> ]; @@ -159,41 +158,41 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface } /// wrapper for matching with OffsetSizeAndStrideOpInterface - /// If source is IntegerType or `const_shape` is filled, + /// If source is IntegerType or `const_shape` is filled, /// it will return `const_shape`, such that mixes of `shape` - /// and `const_shape` will be used to represent the shape of + /// and `const_shape` will be used to represent the shape of /// source operand. They overide static shape from source memref type. ArrayRef getStaticSizes() { auto attr = getConstShapeAttr(); if (getSourceType().isa() || attr) return attr; - + auto memrefType = getSourceType().dyn_cast(); assert(memrefType && "Incorrect use of getStaticSizes"); return memrefType.getShape(); } /// wrapper for matching with OffsetSizeAndStrideOpInterface - /// If source is IntegerType or `const_strides` is filled, it + /// If source is IntegerType or `const_strides` is filled, it /// will return `const_strides`, such that mixes of `strides` - /// and `const_strides` will be used to represent the strides of + /// and `const_strides` will be used to represent the strides of /// source operand. They overide static strides from source memref type. ArrayRef getStaticStrides() { auto attr = getConstStridesAttr(); if (getSourceType().isa() || attr) return attr; - + auto memrefType = getSourceType().dyn_cast(); assert(memrefType && "Incorrect use of getStaticStrides"); auto [strides, offset] = getStridesAndOffset(memrefType); - // reuse the storage of ConstStridesAttr since strides from + // reuse the storage of ConstStridesAttr since strides from // memref is not persistant setConstStrides(strides); attr = getConstStridesAttr(); return attr; } - /// Return the expected rank of each of the`static_offsets`, + /// Return the expected rank of each of the`static_offsets`, /// `static_shape` and `static_strides` attributes. std::array getArrayAttrMaxRanks() { unsigned rank; @@ -204,8 +203,8 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface } return {rank, rank, rank}; } - - /// Return the number of leading operands before the `offsets`, + + /// Return the number of leading operands before the `offsets`, /// `shape` and `strides` operands. 
  static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
@@ -214,15 +213,15 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
 }
 
 def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
-  let summary = "prefetches a nD block to cache";
+  let summary = "prefetches an n-D block to cache";
   let description = [{
-    It issues an instruction to prefetch the data from memory to each
-    level of the cache based on their cache policy.
+    It issues an instruction to prefetch a block of data from contiguous
+    memory regions to each level of the cache based on their cache policy.
 
    Example:
    ```
-    xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
-                              l2_hint = #xegpu.cache_hint<cached>,
+    xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+                              l2_hint = #xegpu.cache_hint<cached>,
                               l3_hint = #xegpu.cache_hint<cached>} : !xegpu.tensor_desc<8x16xf16>
    ```
 
  }];
 
  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
-  let extraClassDeclaration = extraBaseClassDeclaration;
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
 
  let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))";
+
+  let hasVerifier = 1;
 }
 
-def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
-  let summary = "loads a n-D block from memory (represented by TensorDesc)"
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "TensorDesc"]>,
+                                          AllElementCountsMatch<["value", "TensorDesc"]>]> {
+  let summary = "loads an n-D block from memory (represented by TensorDesc)"
                 "to registers (represented by vector)";
  let description = [{
-    LoadNdOp essentially mimics the hardware block read instruction to read
-    a block of data from memory to register. It takes a set of optional cache
-    hints for each level of cache, L1, L2 and L3. If hardware does not have a
+    LoadNdOp essentially mimics the hardware block read instruction to read
+    a block of data from memory to register. It takes a set of optional cache
+    hints for each level of cache, L1, L2 and L3. If hardware does not have a
    corresponding cache, the corresponding cache hint attribute will be masked.
-    vnni transform is an hardware feature for Intel GPU, which is used to
-    do data packing during the load for B operand of matrix operation, if
-    the bit width of the data type is less then 32 bits, e.g., fp16. And
+    vnni transform is a hardware feature for Intel GPU, which is used to
+    do data packing during the load for B operand of matrix operation, if
+    the bit width of the data type is less than 32 bits, e.g., fp16. And
    transpose is another Intel hardware feature, which will do transpose
-    operation when loading the data if the bit width of the data type is
-    fp32 or fp64. It implies that vnni and transpose cannot exit at the
+    operation when loading the data if the bit width of the data type is
+    fp32 or fp64. It implies that vnni and transpose cannot exist at the
    same time.
    Example:
    ```
    xegpu.load_nd %1 {transpose = [1, 0],
-                      l1_hint = #xegpu.cache_hint<cached>,
-                      l2_hint = #xegpu.cache_hint<cached>,
+                      l1_hint = #xegpu.cache_hint<cached>,
+                      l2_hint = #xegpu.cache_hint<cached>,
                       l3_hint = #xegpu.cache_hint<cached>}
            : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
    ```
@@ -291,20 +297,21 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
   let hasVerifier = 1;
 }
 
-def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllShapesMatch<["value", "TensorDesc"]>,
+                                            AllElementTypesMatch<["value", "TensorDesc"]>]> {
   let summary = "stores an n-D block register region back to memory, currently only supports 2D";
   let description = [{
    StoreNdOp essentially mimics the hardware block write instruction to
-    write a block of data from register into the memory region as described
-    by the TensorDesc. It takes a set of optional cache hints for each level
-    of cache, L1, L2 and L3. If hardware does not have a correspoding cache,
+    write a block of data from register into the memory region as described
+    by the TensorDesc. It takes a set of optional cache hints for each level
+    of cache, L1, L2 and L3. If hardware does not have a corresponding cache,
    the corresponding cache hint attribute will be masked.
 
    Example:
    ```
    xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<write_back>,
-                           l2_hint = #xegpu.cache_hint<write_back>,
+                           l2_hint = #xegpu.cache_hint<write_back>,
                            l3_hint = #xegpu.cache_hint<write_back>}
                           : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
    ```
@@ -318,11 +325,342 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
 
-  let extraClassDeclaration = extraBaseClassDeclaration;
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    VectorType getValueType() {
+      return llvm::dyn_cast<VectorType>(getValue().getType());
+    }
 
-  let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
+
+  let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
                         `:` type($value) `,` qualified(type($TensorDesc))}];
 
   let hasVerifier = 1;
 }
 
+def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
+                [AllTypesMatch<["TensorDesc", "result"]>]> {
+  let summary = "It updates the offsets for the TensorDesc.";
+  let description = [{The op updates the offset of the given TensorDesc.
+    The offsets are relative offsets to the current position, in numbers
+    of elements. It will result in a TensorDesc of the same type as the input.
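Before the minimal one-line example below, a broader sketch of the intended usage pattern: advancing a block descriptor across a row inside a loop. This is illustrative only; it assumes the `create_nd_tdesc`, `load_nd`, and `store_nd` ops defined above plus standard `scf`/`arith`, and the shapes and names are invented for the sketch:

```mlir
func.func @copy_row_tiles(%src: memref<8x1024xf32>, %dst: memref<8x1024xf32>) {
  %c0 = arith.constant 0 : index
  %c16 = arith.constant 16 : index
  %c1024 = arith.constant 1024 : index
  %a0 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
  %b0 = xegpu.create_nd_tdesc %dst[0, 0] : memref<8x1024xf32> -> !xegpu.tensor_desc<8x16xf32>
  // Walk both descriptors across the row, one 8x16 tile at a time.
  %a1, %b1 = scf.for %iv = %c0 to %c1024 step %c16
      iter_args(%a = %a0, %b = %b0)
      -> (!xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>) {
    %v = xegpu.load_nd %a : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
    xegpu.store_nd %v, %b : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
    %an = xegpu.update_nd_offset %a, [0, 16] : !xegpu.tensor_desc<8x16xf32>
    %bn = xegpu.update_nd_offset %b, [0, 16] : !xegpu.tensor_desc<8x16xf32>
    scf.yield %an, %bn : !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
  }
  return
}
```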
+
+    example:
+    ```
+      %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
+    ```
+  }];
+
+  let arguments = (ins
+    XeGPU_TensorDesc: $TensorDesc,
+    Variadic<Index>: $offsets,
+    DenseI64ArrayAttr: $const_offsets);
+
+  let results = (outs XeGPU_TensorDesc: $result);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      Builder b(getContext());
+      return getMixedValues(getConstOffsets(), getOffsets(), b);
+    }
+
+    size_t getNumOffsets() {
+      return getMixedOffsets().size();
+    }
+
+    OpFoldResult getOffset(unsigned idx) {
+      assert(idx < getNumOffsets() && "Invalid out of bound access.");
+      return getMixedOffsets()[idx];
+    }
+  }];
+
+  let assemblyFormat = [{
+    $TensorDesc `,`
+    custom<DynamicIndexList>($offsets, $const_offsets)
+    attr-dict `:` qualified(type($result))
+  }];
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
+  let summary = "create scattered tensor descriptors (TensorDesc).";
+  let description = [{
+    "create_tdesc" is similar to "create_nd_tdesc" in that it creates
+    a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
+    is for creating contiguous subviews, "create_tdesc" is for creating non-contiguous
+    (scattered) subviews, allowing each work-item in a subgroup to specify its own offset.
+    It accepts the following parameters:
+
+    * source: a 1D memref or pointer (uint64_t) that represents the flattened memory object.
+    * offsets: an array containing the offsets of each access point. Its size
+      is fixed to the hardware supported subgroup size, e.g., 16 on PVC,
+      implying each element in the array corresponds to a work-item (SIMT lane)
+      in the subgroup.
+    * chunk_size: [optional attribute] indicates the number of contiguous
+      elements accessed for each offset, default is 1.
+
+    Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
+    ```
+    %a = memref.alloc() : memref<1024xf32>
+    %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32>
+    ```
+
+    Example 2. It assumes subgroup size is 4, and each work-item accesses 8 elements.
+    It will access a total of 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
+    ```
+    %0 = memref.alloc() : memref<1024xf32>
+    %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
+    ```
+
+    Example 3. It is similar to Example 2, but there are some overlaps among work-items.
+    It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
+    ```
+    %0 = memref.alloc() : memref<1024xf32>
+    %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
+    ```
+
+
+
+
+  }];
+
+  let arguments = (ins XeGPU_BaseAddrType: $source,
+                       Variadic<Index>: $offsets,
+                       DenseI64ArrayAttr: $const_offsets,
+                       DefaultValuedAttr<I64Attr, "1">: $chunk_size);
+  let results = (outs XeGPU_TensorDesc:$TensorDesc);
+
+  let builders = [
+    OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
+                   "llvm::ArrayRef<OpFoldResult>": $offsets,
+                   CArg<"uint32_t", "1"> : $chunk_size)>,
+  ];
+
+  let assemblyFormat = [{
+    $source
+    custom<DynamicIndexList>($offsets, $const_offsets)
+    attr-dict `:` type($source) `->` qualified(type($TensorDesc))
+  }];
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      Builder b(getContext());
+      return getMixedValues(getConstOffsets(), getOffsets(), b);
+    }
+
+    size_t getNumOffsets() {
+      return getMixedOffsets().size();
+    }
+
+    mlir::Value getViewSource() { return getSource(); }
+
+    OpFoldResult getOffset(unsigned idx) {
+      assert(idx < getNumOffsets() && "Invalid out of bound access.");
+      return getMixedOffsets()[idx];
+    }
+  }];
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
+  let summary = "prefetches a set of scattered data points to cache";
+
+  let description = [{
+    It issues instructions to prefetch a set of scattered data points
+    from memory to each level of the cache based on their cache policy.
+    As compared to prefetch_nd, which works on non-scattered TensorDescs,
+    it works on scattered TensorDescs instead.
+
+    Example:
+    ```
+    xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+                           l2_hint = #xegpu.cache_hint<cached>,
+                           l3_hint = #xegpu.cache_hint<cached>}
+      : !xegpu.tensor_desc<16xf16>
+    ```
+
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
+
+  let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))";
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]>,
+                                           AllElementTypesMatch<["value", "TensorDesc"]>,
+                                           AllElementCountsMatch<["value", "TensorDesc"]>]> {
+  let summary = "loads a set of scattered data points from memory.";
+
+  let description = [{ It (aka. load) loads data for each work-item. The output
+    describes the data being loaded at the subgroup level, so its size is
+    consistent with the number of work-items in a subgroup. When the `chunk_size`
+    attribute is larger than 1 in the TensorDesc, the output vector will be a 2D
+    vector, with dim-1 corresponding to the chunk size.
+
+    The mask operand masks out memory accesses so that it is safe to pass out-of-boundary
+    addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
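Before the canonical example that follows, a hedged end-to-end sketch of a gather: descriptor creation, mask materialization, and the load itself. The subgroup size of 4, the offsets, and the `scattered = true` encoding spelling are assumptions made for illustration:

```mlir
func.func @gather(%src: memref<1024xf32>) -> vector<4x8xf32> {
  // One offset per work-item (SIMT lane); each lane reads a chunk of 8 elements.
  %tdesc = xegpu.create_tdesc %src[0, 16, 32, 64] {chunk_size = 8}
      : memref<1024xf32> -> !xegpu.tensor_desc<4x8xf32, #xegpu.tdesc_attr<scattered = true>>
  // Enable all four lanes; out-of-boundary lanes would be masked off instead.
  %mask = arith.constant dense<true> : vector<4xi1>
  %val = xegpu.load %tdesc, %mask
      : !xegpu.tensor_desc<4x8xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1> -> vector<4x8xf32>
  return %val : vector<4x8xf32>
}
```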
+
+    Example:
+    ```
+      %2 = xegpu.load %1, %0 {transpose = [1, 0],
+                              l1_hint = #xegpu.cache_hint<cached>,
+                              l2_hint = #xegpu.cache_hint<cached>,
+                              l3_hint = #xegpu.cache_hint<cached>}
+            : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>
+              -> vector<16xf32>
+    ```
+
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       XeGPU_MaskType: $mask,
+                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+  let results = (outs XeGPU_ValueType: $value);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+
+    mlir::Type getElementType() {
+      auto type = getValue().getType();
+      return getElementTypeOrSelf(type);
+    }
+
+    Type getValueType() {
+      return getValue().getType();
+    }
+
+    Type getMaskType() {
+      return getMask().getType();
+    }
+
+  }];
+
+  let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
+      `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>,
+                                              AllElementTypesMatch<["value", "TensorDesc"]>]> {
+  let summary = "store data to scattered memory locations.";
+  let description = [{ It (aka. store) stores data to scattered memory locations.
+    It has similar semantics to `load_gather`.
+
+    Example:
+    ```
+      %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<write_back>,
+                                   l2_hint = #xegpu.cache_hint<write_back>,
+                                   l3_hint = #xegpu.cache_hint<write_back>}
+            : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>
+    ```
+  }];
+
+  let arguments = (ins
+    XeGPU_ValueType: $value,
+    XeGPU_TensorDesc: $TensorDesc,
+    XeGPU_MaskType: $mask,
+    OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+    OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+    OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+
+    Type getValueType() {
+      return getValue().getType();
+    }
+
+    Type getMaskType() {
+      return getMask().getType();
+    }
+  }];
+
+  let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
+            `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
+                [AllTypesMatch<["TensorDesc", "result"]>]> {
+  let summary = "It updates the offsets for the given tensor descriptor";
+
+  let description = [{It behaves similarly to `update_nd_offset` in that
+    it updates the offset of a TensorDesc, and the offsets are relative offsets
+    to the current position in the number of elements. However, `update_nd_offset`
+    updates the start point of a 2D block, so its offset contains two
+    elements representing the shift in each dimension. `update_offset` updates
+    the offset per work-item, so its offsets contain values representing
+    shifts for each work-item.
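As a complement to the minimal example below, a hedged sketch pairing `update_offset` with a second gather; the types follow the earlier gather sketch and all values are illustrative:

```mlir
func.func @advance_and_gather(
    %tdesc: !xegpu.tensor_desc<4x8xf32, #xegpu.tdesc_attr<scattered = true>>,
    %mask: vector<4xi1>) -> vector<4x8xf32> {
  // Shift every work-item's offset forward by 32 elements, then gather again.
  %next = xegpu.update_offset %tdesc, [32, 32, 32, 32]
      : !xegpu.tensor_desc<4x8xf32, #xegpu.tdesc_attr<scattered = true>>
  %val = xegpu.load %next, %mask
      : !xegpu.tensor_desc<4x8xf32, #xegpu.tdesc_attr<scattered = true>>, vector<4xi1> -> vector<4x8xf32>
  return %val : vector<4x8xf32>
}
```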
+
+    Example:
+    ```
+      %2 = xegpu.update_offset %1, [32, 32, 32, 32]
+            : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+    ```
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       Variadic<Index>: $offsets,
+                       DenseI64ArrayAttr: $const_offsets);
+  let results = (outs XeGPU_TensorDesc: $result);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      Builder b(getContext());
+      return getMixedValues(getConstOffsets(), getOffsets(), b);
+    }
+
+    size_t getNumOffsets() {
+      return getMixedOffsets().size();
+    }
+
+    OpFoldResult getOffset(unsigned idx) {
+      assert(idx < getNumOffsets() && "Invalid out of bound access.");
+      return getMixedOffsets()[idx];
+    }
+  }];
+
+  let assemblyFormat = [{
+    $TensorDesc `,`
+    custom<DynamicIndexList>($offsets, $const_offsets)
+    attr-dict `:` qualified(type($TensorDesc))
+  }];
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 19ac1693712dd8..4cd4e5411653c1 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -34,10 +34,10 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
         [ShapedTypeInterface], "::mlir::TensorType"> {
   let summary = "TensorDesc describing regions of the data of interest.";
   let description = [{
-    TensorDesc is a type designed to describe regions of the interested data as well as some
-    features that are unique to Intel hardware. Different with the builtin tensor type in MLIR,
-    it essentially only contains the meta data, and doesn't hold the data by itself. It is designed
-    to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU.
+    TensorDesc is a type designed to describe regions of the data of interest as well as some
+    features that are unique to Intel hardware. Different from the builtin tensor type in MLIR,
+    it essentially only contains the metadata, and doesn't hold the data by itself. It is designed
+    to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU.
    It encodes the following information:

    * shape: the sizes/shape of the interested data block, e.g., 8x16 means 8 rows
      is set or not.
    * element_type: the data type of the data element, e.g., f16, f32.

    Similar to the builtin tensor, it also provides an optional attribute for encoding
    the following information via the TensorDescAttr object:
-    * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
+    * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
      global memory or shared memory. It defaults to Global.
    * array_length (int): [optional] The number of contiguous blocks with size as `shape`,
      that will be loaded by block load at a time. It defaults to 1.
-    * boundary_check (bool): [optional] indicates whether the operation detects the boundary
+    * boundary_check (bool): [optional] indicates whether the operation detects the boundary
      and pads with zero for out-of-boundary access. The default is to do the boundary check.
-
+
    Syntax:

    ```
    TensorDesc-type ::= `tensor_desc` `<` dim-list element-type (attr-list)? `>`
    element-type ::= float-type | integer-type | index-type
    dim-list := (static-dim-list `x`)?
static-dim-list ::= decimal-literal `x` decimal-literal - attr-list = (, memory_scope = value)? (, arr_len = value)? (, boundary_check = value)? + attr-list = (, memory_scope = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? ``` Examples: @@ -84,6 +84,17 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", "mlir::Type": $elementType, OptionalParameter<"mlir::Attribute">: $encoding); + let builders = [ + TypeBuilderWithInferredContext<(ins + "llvm::ArrayRef": $shape, + "mlir::Type": $elementType, + CArg<"bool", "false">: $scattered, + CArg<"int", "1">: $array_length, + CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, + CArg<"bool", "true">: $boundary_check + )> + ]; + let extraClassDeclaration = [{ using TensorType::clone; using mlir::ShapedType::Trait::getElementTypeBitWidth; @@ -116,7 +127,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", if (attr && attr.getArrayLength()) return attr.getArrayLength().getInt(); // return default value - return 1; + return 1; } bool getBoundaryCheck() { @@ -126,10 +137,18 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", // return default value return true; } + + bool getScattered() { + auto attr = getEncodingAsTensorDescAttr(); + if (attr && attr.getScattered()) + return attr.getScattered().getValue(); + // return default value + return false; + } }]; let hasCustomAssemblyFormat = true; - + } #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 0b3f4b9c9dbeae..24719fe748fe4f 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -32,6 +32,17 @@ void XeGPUDialect::initialize() { //===----------------------------------------------------------------------===// // XeGPU_TensorDescAttr //===----------------------------------------------------------------------===// +TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context, + xegpu::MemoryScope memory_scope, + int array_length, bool boundary_check, + bool scattered) { + auto scopeAttr = MemoryScopeAttr::get(context, memory_scope); + auto lengthAttr = + IntegerAttr::get(IntegerType::get(context, 64), array_length); + auto boundaryAttr = BoolAttr::get(context, boundary_check); + auto scatteredAttr = BoolAttr::get(context, scattered); + return Base::get(context, scopeAttr, lengthAttr, boundaryAttr, scatteredAttr); +} //===----------------------------------------------------------------------===// // XeGPU_TensorDescType @@ -96,6 +107,16 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const { printer << ">"; } +TensorDescType TensorDescType::get(llvm::ArrayRef shape, + mlir::Type elementType, bool scattered, + int array_length, MemoryScope memory_scope, + bool boundary_check) { + auto context = elementType.getContext(); + auto attr = TensorDescAttr::get(context, memory_scope, array_length, + boundary_check, scattered); + return Base::get(context, shape, elementType, attr); +} + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 02106f221f3233..621986c54d492c 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -9,6 +9,9 @@ #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/TypeUtilities.h" + +#include "llvm/Support/Debug.h" 
#define DEBUG_TYPE "xegpu" @@ -16,8 +19,8 @@ namespace mlir { namespace xegpu { static void transpose(llvm::ArrayRef trans, - std::vector &shape) { - std::vector old = shape; + SmallVector &shape) { + SmallVector old = shape; for (size_t i = 0; i < trans.size(); i++) shape[i] = old[trans[i]]; } @@ -38,6 +41,38 @@ static std::string makeString(T array, bool breakline = false) { return buf; } +static SmallVector getShapeOf(Type type) { + SmallVector shape; + if (auto ty = llvm::dyn_cast(type)) + shape = SmallVector(ty.getShape()); + else + shape.push_back(1); + return shape; +} + +static int64_t getRankOf(Value val) { + auto type = val.getType(); + if (auto ty = llvm::dyn_cast(type)) + return ty.getRank(); + return 0; +}; + +static bool isReadHintOrNone(const CachePolicyAttr &attr) { + if (!attr) + return true; + auto kind = attr.getValue(); + return kind == CachePolicy::CACHED || kind == CachePolicy::UNCACHED || + kind == CachePolicy::STREAMING || kind == CachePolicy::READ_INVALIDATE; +} + +static bool isWriteHintOrNone(const CachePolicyAttr &attr) { + if (!attr) + return true; + auto kind = attr.getValue(); + return kind == CachePolicy::CACHED || kind == CachePolicy::UNCACHED || + kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH; +} + //===----------------------------------------------------------------------===// // XeGPU_CreateNdDescOp //===----------------------------------------------------------------------===// @@ -114,6 +149,29 @@ LogicalResult CreateNdDescOp::verify() { return emitOpError("TensorDesc should have the same element " "type with the source if it is a memref.\n"); + if (getType().getScattered()) + return emitOpError("Expects a non-scattered TensorDesc.\n"); + + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_PrefetchNdOp +//===----------------------------------------------------------------------===// +LogicalResult PrefetchNdOp::verify() { + auto tdescTy = getTensorDescType(); + if (tdescTy.getScattered()) + return emitOpError("Expects a non-scattered TensorDesc.\n"); + + if (!isReadHintOrNone(getL1HintAttr())) + return emitOpError("invlid l1_hint: ") << getL1HintAttr(); + + if (!isReadHintOrNone(getL2HintAttr())) + return emitOpError("invlid l2_hint: ") << getL2HintAttr(); + + if (!isReadHintOrNone(getL3HintAttr())) + return emitOpError("invlid l3_hint: ") << getL3HintAttr(); + return success(); } @@ -125,22 +183,26 @@ LogicalResult LoadNdOp::verify() { auto valueTy = getType(); if (tdescTy.getRank() != 2) - return emitOpError( - "The TensorDesc for LoadNdOp should be a 2D TensorDesc."); + return emitOpError("Expecting a 2D TensorDesc.\n"); + + if (tdescTy.getScattered()) + return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!valueTy) return emitOpError("Invalid result, it should be a VectorType.\n"); - auto tdescElemTy = tdescTy.getElementType(); - auto valueElemTy = valueTy.getElementType(); + if (!isReadHintOrNone(getL1HintAttr())) + return emitOpError("invlid l1_hint: ") << getL1HintAttr(); - if (tdescElemTy != valueElemTy) - return emitOpError( - "Value should have the same element type as TensorDesc."); + if (!isReadHintOrNone(getL2HintAttr())) + return emitOpError("invlid l2_hint: ") << getL2HintAttr(); + + if (!isReadHintOrNone(getL3HintAttr())) + return emitOpError("invlid l3_hint: ") << getL3HintAttr(); auto array_len = tdescTy.getArrayLength(); - auto tdescShape = tdescTy.getShape().vec(); - auto valueShape = valueTy.getShape().vec(); + auto tdescShape = 
+  auto valueShape = getShapeOf(valueTy);
 
   if (getTranspose()) {
     auto trans = getTranspose().value();
@@ -174,26 +236,174 @@
 // XeGPU_StoreNdOp
 //===----------------------------------------------------------------------===//
 LogicalResult StoreNdOp::verify() {
-  auto dstTy = getTensorDesc().getType();               // Tile
-  auto valTy = getValue().getType().cast<VectorType>(); // Vector
+  auto dstTy = getTensorDescType(); // Tile
+  auto valTy = getValueType();      // Vector
 
   if (dstTy.getRank() != 2)
-    return emitOpError("Expecting a 2D TensorDesc shape.\n");
+    return emitOpError("Expecting a 2D TensorDesc.\n");
+
+  if (dstTy.getScattered())
+    return emitOpError("Expects a non-scattered TensorDesc.\n");
 
   if (!valTy)
     return emitOpError("Exepcting a VectorType result.\n");
 
-  auto dstElemTy = dstTy.getElementType();
-  auto valElemTy = valTy.getElementType();
+  if (!isWriteHintOrNone(getL1HintAttr()))
+    return emitOpError("invalid l1_hint: ") << getL1HintAttr();
+
+  if (!isWriteHintOrNone(getL2HintAttr()))
+    return emitOpError("invalid l2_hint: ") << getL2HintAttr();
+
+  if (!isWriteHintOrNone(getL3HintAttr()))
+    return emitOpError("invalid l3_hint: ") << getL3HintAttr();
+
+  return success();
+}
 
-  if (dstElemTy != valElemTy) {
-    return emitOpError() << "The element type of the value should "
-                            "match the elementtype of the TensorDesc.\n";
+//===----------------------------------------------------------------------===//
+// XeGPU_UpdateNDOffsetOp
+//===----------------------------------------------------------------------===//
+LogicalResult UpdateNdOffsetOp::verify() {
+  auto ty = getTensorDescType();
+  if (ty.getScattered())
+    return emitOpError("Expects a non-scattered TensorDesc.\n");
+
+  // number of offsets specified must match the rank of the tensor descriptor
+  if (ty.getRank() != (int64_t)getNumOffsets()) {
+    return emitOpError("Invalid number of offsets.");
   }
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_CreateDescOp
+//===----------------------------------------------------------------------===//
+void CreateDescOp::build(OpBuilder &builder, OperationState &state,
+                         TensorDescType TensorDesc, Value source,
+                         llvm::ArrayRef<OpFoldResult> offsets,
+                         uint32_t chunk_size) {
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<Value> dynamicOffsets;
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+  build(builder, state, TensorDesc, source, dynamicOffsets, staticOffsets,
+        chunk_size);
+}
+
+LogicalResult CreateDescOp::verify() {
+  auto tdescTy = getTensorDescType();
+  auto chunkSize = getChunkSize();
+
+  if (getRankOf(getSource()) > 1)
+    return emitOpError(
+        "Expecting the source is a 1D memref or pointer (uint64_t).");
+
+  if (!tdescTy.getScattered())
+    return emitOpError("Expects a scattered TensorDesc.\n");
+
+  SmallVector<int64_t> shape({(int64_t)getNumOffsets()});
+  if (chunkSize != 1)
+    shape.push_back(chunkSize);
+
+  auto tdescShape = getShapeOf(tdescTy);
+  if (shape != tdescShape)
+    return emitOpError("Incorrect TensorDesc shape. ")
+           << "Expected is " << makeString(shape) << "\n";
") + << "Expected is " << makeString(shape) << "\n"; + + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_PrefetchOp +//===----------------------------------------------------------------------===// +LogicalResult PrefetchOp::verify() { + auto tdescTy = getTensorDescType(); + if (!tdescTy.getScattered()) + return emitOpError("Expects a scattered TensorDesc.\n"); + + if (!isReadHintOrNone(getL1HintAttr())) + return emitOpError("invlid l1_hint: ") << getL1HintAttr(); + + if (!isReadHintOrNone(getL2HintAttr())) + return emitOpError("invlid l2_hint: ") << getL2HintAttr(); + + if (!isReadHintOrNone(getL3HintAttr())) + return emitOpError("invlid l3_hint: ") << getL3HintAttr(); + + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_LoadGatherOp +//===----------------------------------------------------------------------===// +LogicalResult LoadGatherOp::verify() { + auto tdescTy = getTensorDescType(); + auto maskTy = getMaskType(); + auto valueTy = getValueType(); + + if (!tdescTy.getScattered()) + return emitOpError("Expects a scattered TensorDesc.\n"); + + if (!isReadHintOrNone(getL1HintAttr())) + return emitOpError("invlid l1_hint: ") << getL1HintAttr(); + + if (!isReadHintOrNone(getL2HintAttr())) + return emitOpError("invlid l2_hint: ") << getL2HintAttr(); + + if (!isReadHintOrNone(getL3HintAttr())) + return emitOpError("invlid l3_hint: ") << getL3HintAttr(); + + auto tdescElemTy = tdescTy.getElementType(); + auto valueElemTy = getElementType(); + if (tdescElemTy != valueElemTy) + return emitOpError( + "Value should have the same element type as TensorDesc."); + + auto maskShape = getShapeOf(maskTy); + auto valueShape = getShapeOf(valueTy); + auto tdescShape = getShapeOf(tdescTy); + + if (tdescShape[0] != maskShape[0]) + return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); + + if (getTransposeAttr()) { + auto trans = getTranspose().value(); + if (tdescShape.size() < trans.size()) + emitWarning("Invalid transpose attr. 
+    else
+      transpose(trans, tdescShape);
+  }
+
+  if (valueShape != tdescShape)
+    return emitOpError("Unexpected result shape")
+           << "(Expected shape: " << makeString(tdescShape)
+           << ", Given shape: " << makeString(valueShape) << ").\n";
+
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreScatterOp
+//===----------------------------------------------------------------------===//
+LogicalResult StoreScatterOp::verify() {
+  auto tdescTy = getTensorDescType();
+  if (!tdescTy.getScattered())
+    return emitOpError("Expects a scattered TensorDesc.\n");
+
+  if (!isWriteHintOrNone(getL1HintAttr()))
+    return emitOpError("invalid l1_hint: ") << getL1HintAttr();
+
+  if (!isWriteHintOrNone(getL2HintAttr()))
+    return emitOpError("invalid l2_hint: ") << getL2HintAttr();
+
+  if (!isWriteHintOrNone(getL3HintAttr()))
+    return emitOpError("invalid l3_hint: ") << getL3HintAttr();
+
+  auto maskTy = getMaskType();
+  auto maskShape = getShapeOf(maskTy);
+  auto tdescShape = getShapeOf(tdescTy);
+  if (tdescShape[0] != maskShape[0])
+    return emitOpError("dim-0 of the Mask and TensorDesc should be the same.");
 
-  if (dstTy.getShape() != valTy.getShape())
-    return emitOpError()
-           << "The result shape should match the TensorDesc shape.\n";
   return success();
 }
 
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
index 039346adbb851c..f0945c79a94ac3 100644
--- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
+++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
@@ -59,4 +59,66 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
   gpu.return
 }
 
+// CHECK: gpu.func @test_create_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+  // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32>
+  %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_create_tdesc_vc(%src: ui64) {
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>
+  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_prefetch_vc(%src: ui64) {
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>
+  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>
+  // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>
+  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>
+  gpu.return
+}
+
+// CHECK: gpu.func @test_load_gather_vc(%[[arg0:.*]]: ui64) {
+gpu.func @test_load_gather_vc(%src: ui64) {
+  //CHECK: %[[cst:.*]] = arith.constant dense<true> : vector<4xi1>
+  %0 = arith.constant dense<1>: vector<4xi1>
+  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> 
!xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + //CHECK-SAME: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> -> vector<4x2xf32> + %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> -> vector<4x2xf32> + gpu.return +} + +// CHECK: gpu.func @test_store_scatter_vc(%[[arg0:.*]]: ui64) { +gpu.func @test_store_scatter_vc(%src: ui64) { + //CHECK: %[[c0:.*]] = arith.constant dense : vector<4xi1> + %0 = arith.constant dense<1>: vector<4xi1> + //CHECK: %[[c1:.*]] = arith.constant dense<2.900000e+00> : vector<4x2xf32> + %1 = arith.constant dense<2.9>: vector<4x2xf32> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %2 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + //CHECK: xegpu.store %[[c1]], %[[R0]], %[[c0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + //CHECK-SAME: vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + gpu.return +} + +// CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) { +gpu.func @test_create_update_tdesc_vc(%src: ui64) { + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %2 = xegpu.update_offset %1, [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + gpu.return +} + } \ No newline at end of file diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir new file mode 100644 index 00000000000000..5e29361ec69087 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -0,0 +1,159 @@ +// RUN: mlir-opt %s -split-input-file -verify-diagnostics + +// ----- +func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) { + // expected-error@+1 {{Expecting the rank of shape, strides, offsets, source memref type (if source is a memref) and TensorDesc should match with each other. 
They currenlty are 2D.}}
+  %1 = xegpu.create_nd_tdesc %src[0] : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32>
+  return
+}
+
+// -----
+
+func.func @test_create_nd_tdesc_vc_2(%src: memref<24x32xf32>) {
+  // expected-error@+1 {{TensorDesc should have the same element type with the source if it is a memref}}
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16>
+  return
+}
+
+// -----
+func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) {
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}}
+  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16>
+  return
+}
+
+// -----
+func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) {
+  %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7]
+          : memref<24xf16> -> !xegpu.tensor_desc<8xf16, #xegpu.tdesc_attr>
+  // expected-error@+1 {{Expects a non-scattered TensorDesc}}
+  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}>
+        : !xegpu.tensor_desc<8xf16, #xegpu.tdesc_attr>
+  return
+}
+
+// -----
+func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) {
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+  // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}}
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}>
+      : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+  return
+}
+
+// -----
+func.func @test_load_nd_vc_2(%src: memref<16xf16>) {
+  %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2}
+        : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr>
+  // expected-error@+1 {{Expects a non-scattered TensorDesc.}}
+  %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}>
+      : !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> -> vector<8x2xf16>
+  return
+}
+
+// -----
+func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) {
+  %1 = arith.constant dense<1.0>: vector<24x32xf16>
+  %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}}
+  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+  return
+}
+
+// -----
+func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
+  %1 = arith.constant dense<1.0>: vector<8x2xf16>
+  %2 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2}
+        : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr>
+  // expected-error@+1 {{Expects a non-scattered TensorDesc}}
+  xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint}>
+        : vector<8x2xf16>, !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr>
+  return
+}
+
+// -----
+func.func @test_update_nd_offset_1(%dst: memref<16xf16>) {
+  %1 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2}
+        : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr>
+  // expected-error@+1 {{Expects a non-scattered TensorDesc}}
+  xegpu.update_nd_offset %1, [0, 2] : !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr>
+  return
+}
+
+// -----
+func.func @test_create_tdesc_vc_1(%src: ui64) {
+  // expected-error@+1 {{Expects a scattered TensorDesc}}
+  %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2}
+        : ui64 -> !xegpu.tensor_desc<8x2xf16>
+  return
+}
+
+// -----
+func.func @test_create_tdesc_vc_2(%src: ui64) {
+  // expected-error@+1 {{Incorrect TensorDesc shape}}
+  %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2}
+        : ui64 -> !xegpu.tensor_desc<8x4xf16, #xegpu.tdesc_attr>
+  return
+}
+
+// -----
+func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) {
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+  // expected-error@+1 {{Expects a scattered TensorDesc}}
+  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<24x32xf16>
+  return
+}
+
+// -----
+func.func @test_prefetch_vc_2(%src: ui64) {
+  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>
+  // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}}
+  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>
+  return
+}
+
+// -----
+func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) {
+  %0 = arith.constant dense<1>: vector<4xi1>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<4x2xf16>
+  // expected-error@+1 {{Expects a scattered TensorDesc}}
+  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}>
+      : !xegpu.tensor_desc<4x2xf16>, vector<4xi1> -> vector<4x2xf16>
+  return
+}
+
+// -----
+func.func @test_load_gather_vc_2(%src: ui64) {
+  %0 = arith.constant dense<1>: vector<4xi1>
+  %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64
+        -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>
+  // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}}
+  %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}>
+        : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1>
+          -> vector<4x2xf32>
+  return
+}
+
+// -----
+func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) {
+  %0 = arith.constant dense<1>: vector<4xi1>
+  %1 = arith.constant dense<2.9>: vector<4x2xf32>
+  %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x2xf32>
+  // expected-error@+1 {{Expects a scattered TensorDesc}}
+  xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint}>
+      : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32>, vector<4xi1>
+  return
+}
+
+// -----
+func.func @test_store_scatter_vc_2(%src: ui64) {
+  %0 = arith.constant dense<1>: vector<4xi1>
+  %1 = arith.constant dense<2.9>: vector<4x2xf32>
+  %2 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2}
+      : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>
+  // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint}}
+  xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint}> : vector<4x2xf32>,
+    !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1>
+  return
+}
\ No newline at end of file

From 9ec8c961664de3b3fcc1cbd5238e40ec8c9bdddb Mon Sep 17 00:00:00 2001
From: Samira Bazuzi
Date: Tue, 16 Apr 2024 14:46:05 -0400
Subject: [PATCH 152/300] [clang][dataflow] Expose getReferencedDecls and
 relocate free functions. (#88754)

Moves free functions from DataflowEnvironment.h/cc and
DataflowAnalysisContext.h/cc to RecordOps and a new ASTOps and exposes
them as needed for current use and to expose getReferencedDecls for
out-of-tree use.

Minimal change in functionality: only the return type of
getReferencedDecls changes, returning the collected decls instead of
using output params.

Tested with `ninja check-clang-tooling`.
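For out-of-tree callers, usage of the newly exposed entry point looks
roughly like the sketch below. The helper function and the printing are
illustrative assumptions, not code from this patch, and `FD` is assumed
to have a body, as `getReferencedDecls` requires:

```
#include "clang/Analysis/FlowSensitive/ASTOps.h"
#include "llvm/Support/raw_ostream.h"

// Hypothetical out-of-tree helper: list everything a function references.
static void dumpReferencedDecls(const clang::FunctionDecl &FD) {
  using namespace clang::dataflow;
  // Collect fields, globals, and functions referenced from FD's body
  // (and, for constructors, its initializers).
  ReferencedDecls RD = getReferencedDecls(FD);
  for (const clang::FieldDecl *Field : RD.Fields)
    llvm::errs() << "field: " << Field->getName() << "\n";
  for (const clang::VarDecl *Global : RD.Globals)
    llvm::errs() << "global: " << Global->getName() << "\n";
  for (const clang::FunctionDecl *Fn : RD.Functions)
    llvm::errs() << "function: " << Fn->getName() << "\n";
}
```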
--- clang/docs/tools/clang-formatted-files.txt | 2 + .../clang/Analysis/FlowSensitive/ASTOps.h | 98 +++++++ .../FlowSensitive/DataflowAnalysisContext.h | 28 +- .../FlowSensitive/DataflowEnvironment.h | 36 --- clang/lib/Analysis/FlowSensitive/ASTOps.cpp | 249 ++++++++++++++++++ .../lib/Analysis/FlowSensitive/CMakeLists.txt | 1 + .../FlowSensitive/DataflowAnalysisContext.cpp | 53 +--- .../FlowSensitive/DataflowEnvironment.cpp | 177 +------------ clang/lib/Analysis/FlowSensitive/Transfer.cpp | 2 + 9 files changed, 359 insertions(+), 287 deletions(-) create mode 100644 clang/include/clang/Analysis/FlowSensitive/ASTOps.h create mode 100644 clang/lib/Analysis/FlowSensitive/ASTOps.cpp diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt index 3089438c23d94e..2252d0ccde96d2 100644 --- a/clang/docs/tools/clang-formatted-files.txt +++ b/clang/docs/tools/clang-formatted-files.txt @@ -123,6 +123,7 @@ clang/include/clang/Analysis/Analyses/CalledOnceCheck.h clang/include/clang/Analysis/Analyses/CFGReachabilityAnalysis.h clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h +clang/include/clang/Analysis/FlowSensitive/ASTOps.h clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -307,6 +308,7 @@ clang/lib/Analysis/CalledOnceCheck.cpp clang/lib/Analysis/CloneDetection.cpp clang/lib/Analysis/CodeInjector.cpp clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp +clang/lib/Analysis/FlowSensitive/ASTOps.cpp clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp clang/lib/Analysis/FlowSensitive/DebugSupport.cpp diff --git a/clang/include/clang/Analysis/FlowSensitive/ASTOps.h b/clang/include/clang/Analysis/FlowSensitive/ASTOps.h new file mode 100644 index 00000000000000..27ad32c1694f77 --- /dev/null +++ b/clang/include/clang/Analysis/FlowSensitive/ASTOps.h @@ -0,0 +1,98 @@ +//===-- ASTOps.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Operations on AST nodes that are used in flow-sensitive analysis. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ASTOPS_H +#define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ASTOPS_H + +#include "clang/AST/Decl.h" +#include "clang/AST/Expr.h" +#include "clang/AST/Type.h" +#include "clang/Analysis/FlowSensitive/StorageLocation.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" + +namespace clang { +namespace dataflow { + +/// Skip past nodes that the CFG does not emit. These nodes are invisible to +/// flow-sensitive analysis, and should be ignored as they will effectively not +/// exist. +/// +/// * `ParenExpr` - The CFG takes the operator precedence into account, but +/// otherwise omits the node afterwards. +/// +/// * `ExprWithCleanups` - The CFG will generate the appropriate calls to +/// destructors and then omit the node. +/// +const Expr &ignoreCFGOmittedNodes(const Expr &E); +const Stmt &ignoreCFGOmittedNodes(const Stmt &S); + +/// A set of `FieldDecl *`. 
Use `SmallSetVector` to guarantee deterministic +/// iteration order. +using FieldSet = llvm::SmallSetVector; + +/// Returns the set of all fields in the type. +FieldSet getObjectFields(QualType Type); + +/// Returns whether `Fields` and `FieldLocs` contain the same fields. +bool containsSameFields(const FieldSet &Fields, + const RecordStorageLocation::FieldToLoc &FieldLocs); + +/// Helper class for initialization of a record with an `InitListExpr`. +/// `InitListExpr::inits()` contains the initializers for both the base classes +/// and the fields of the record; this helper class separates these out into two +/// different lists. In addition, it deals with special cases associated with +/// unions. +class RecordInitListHelper { +public: + // `InitList` must have record type. + RecordInitListHelper(const InitListExpr *InitList); + + // Base classes with their associated initializer expressions. + ArrayRef> base_inits() const { + return BaseInits; + } + + // Fields with their associated initializer expressions. + ArrayRef> field_inits() const { + return FieldInits; + } + +private: + SmallVector> BaseInits; + SmallVector> FieldInits; + + // We potentially synthesize an `ImplicitValueInitExpr` for unions. It's a + // member variable because we store a pointer to it in `FieldInits`. + std::optional ImplicitValueInitForUnion; +}; + +/// A collection of several types of declarations, all referenced from the same +/// function. +struct ReferencedDecls { + /// Non-static member variables. + FieldSet Fields; + /// All variables with static storage duration, notably including static + /// member variables and static variables declared within a function. + llvm::DenseSet Globals; + /// Free functions and member functions which are referenced (but not + /// necessarily called). + llvm::DenseSet Functions; +}; + +/// Returns declarations that are declared in or referenced from `FD`. +ReferencedDecls getReferencedDecls(const FunctionDecl &FD); + +} // namespace dataflow +} // namespace clang + +#endif // LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ASTOPS_H diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h index 909a91059438ca..aa2c366cb164a9 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h @@ -18,6 +18,7 @@ #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" #include "clang/AST/TypeOrdering.h" +#include "clang/Analysis/FlowSensitive/ASTOps.h" #include "clang/Analysis/FlowSensitive/AdornedCFG.h" #include "clang/Analysis/FlowSensitive/Arena.h" #include "clang/Analysis/FlowSensitive/Solver.h" @@ -30,38 +31,11 @@ #include #include #include -#include -#include -#include namespace clang { namespace dataflow { class Logger; -/// Skip past nodes that the CFG does not emit. These nodes are invisible to -/// flow-sensitive analysis, and should be ignored as they will effectively not -/// exist. -/// -/// * `ParenExpr` - The CFG takes the operator precedence into account, but -/// otherwise omits the node afterwards. -/// -/// * `ExprWithCleanups` - The CFG will generate the appropriate calls to -/// destructors and then omit the node. -/// -const Expr &ignoreCFGOmittedNodes(const Expr &E); -const Stmt &ignoreCFGOmittedNodes(const Stmt &S); - -/// A set of `FieldDecl *`. Use `SmallSetVector` to guarantee deterministic -/// iteration order. 
-using FieldSet = llvm::SmallSetVector; - -/// Returns the set of all fields in the type. -FieldSet getObjectFields(QualType Type); - -/// Returns whether `Fields` and `FieldLocs` contain the same fields. -bool containsSameFields(const FieldSet &Fields, - const RecordStorageLocation::FieldToLoc &FieldLocs); - struct ContextSensitiveOptions { /// The maximum depth to analyze. A value of zero is equivalent to disabling /// context-sensitive analysis entirely. diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index 706664d7db1c25..4277792219c0af 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -775,42 +775,6 @@ RecordStorageLocation *getImplicitObjectLocation(const CXXMemberCallExpr &MCE, RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME, const Environment &Env); -/// Returns the fields of a `RecordDecl` that are initialized by an -/// `InitListExpr`, in the order in which they appear in -/// `InitListExpr::inits()`. -/// `Init->getType()` must be a record type. -std::vector -getFieldsForInitListExpr(const InitListExpr *InitList); - -/// Helper class for initialization of a record with an `InitListExpr`. -/// `InitListExpr::inits()` contains the initializers for both the base classes -/// and the fields of the record; this helper class separates these out into two -/// different lists. In addition, it deals with special cases associated with -/// unions. -class RecordInitListHelper { -public: - // `InitList` must have record type. - RecordInitListHelper(const InitListExpr *InitList); - - // Base classes with their associated initializer expressions. - ArrayRef> base_inits() const { - return BaseInits; - } - - // Fields with their associated initializer expressions. - ArrayRef> field_inits() const { - return FieldInits; - } - -private: - SmallVector> BaseInits; - SmallVector> FieldInits; - - // We potentially synthesize an `ImplicitValueInitExpr` for unions. It's a - // member variable because we store a pointer to it in `FieldInits`. - std::optional ImplicitValueInitForUnion; -}; - /// Associates a new `RecordValue` with `Loc` and returns the new value. RecordValue &refreshRecordValue(RecordStorageLocation &Loc, Environment &Env); diff --git a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp new file mode 100644 index 00000000000000..75188aef4d1a43 --- /dev/null +++ b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp @@ -0,0 +1,249 @@ +//===-- ASTOps.cc -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Operations on AST nodes that are used in flow-sensitive analysis. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/FlowSensitive/ASTOps.h"
+#include "clang/AST/ComputeDependence.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclBase.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/ExprCXX.h"
+#include "clang/AST/Stmt.h"
+#include "clang/AST/Type.h"
+#include "clang/Analysis/FlowSensitive/StorageLocation.h"
+#include "clang/Basic/LLVM.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include <cassert>
+#include <iterator>
+#include <vector>
+
+#define DEBUG_TYPE "dataflow"
+
+namespace clang::dataflow {
+
+const Expr &ignoreCFGOmittedNodes(const Expr &E) {
+  const Expr *Current = &E;
+  if (auto *EWC = dyn_cast<ExprWithCleanups>(Current)) {
+    Current = EWC->getSubExpr();
+    assert(Current != nullptr);
+  }
+  Current = Current->IgnoreParens();
+  assert(Current != nullptr);
+  return *Current;
+}
+
+const Stmt &ignoreCFGOmittedNodes(const Stmt &S) {
+  if (auto *E = dyn_cast<Expr>(&S))
+    return ignoreCFGOmittedNodes(*E);
+  return S;
+}
+
+// FIXME: Does not precisely handle non-virtual diamond inheritance. A single
+// field decl will be modeled for all instances of the inherited field.
+static void getFieldsFromClassHierarchy(QualType Type, FieldSet &Fields) {
+  if (Type->isIncompleteType() || Type->isDependentType() ||
+      !Type->isRecordType())
+    return;
+
+  for (const FieldDecl *Field : Type->getAsRecordDecl()->fields())
+    Fields.insert(Field);
+  if (auto *CXXRecord = Type->getAsCXXRecordDecl())
+    for (const CXXBaseSpecifier &Base : CXXRecord->bases())
+      getFieldsFromClassHierarchy(Base.getType(), Fields);
+}
+
+/// Gets the set of all fields in the type.
+FieldSet getObjectFields(QualType Type) {
+  FieldSet Fields;
+  getFieldsFromClassHierarchy(Type, Fields);
+  return Fields;
+}
+
+bool containsSameFields(const FieldSet &Fields,
+                        const RecordStorageLocation::FieldToLoc &FieldLocs) {
+  if (Fields.size() != FieldLocs.size())
+    return false;
+  for ([[maybe_unused]] auto [Field, Loc] : FieldLocs)
+    if (!Fields.contains(cast_or_null<FieldDecl>(Field)))
+      return false;
+  return true;
+}
+
+/// Returns the fields of a `RecordDecl` that are initialized by an
+/// `InitListExpr`, in the order in which they appear in
+/// `InitListExpr::inits()`.
+/// `Init->getType()` must be a record type.
+static std::vector<const FieldDecl *>
+getFieldsForInitListExpr(const InitListExpr *InitList) {
+  const RecordDecl *RD = InitList->getType()->getAsRecordDecl();
+  assert(RD != nullptr);
+
+  std::vector<const FieldDecl *> Fields;
+
+  if (InitList->getType()->isUnionType()) {
+    Fields.push_back(InitList->getInitializedFieldInUnion());
+    return Fields;
+  }
+
+  // Unnamed bitfields are only used for padding and do not appear in
+  // `InitListExpr`'s inits. However, those fields do appear in `RecordDecl`'s
+  // field list, and we thus need to remove them before mapping inits to
+  // fields to avoid mapping inits to the wrong fields.
+  llvm::copy_if(
+      RD->fields(), std::back_inserter(Fields),
+      [](const FieldDecl *Field) { return !Field->isUnnamedBitfield(); });
+  return Fields;
+}
+
+RecordInitListHelper::RecordInitListHelper(const InitListExpr *InitList) {
+  auto *RD = InitList->getType()->getAsCXXRecordDecl();
+  assert(RD != nullptr);
+
+  std::vector<const FieldDecl *> Fields = getFieldsForInitListExpr(InitList);
+  ArrayRef<Expr *> Inits = InitList->inits();
+
+  // Unions initialized with an empty initializer list need special treatment.
+ // For structs/classes initialized with an empty initializer list, Clang + // puts `ImplicitValueInitExpr`s in `InitListExpr::inits()`, but for unions, + // it doesn't do this -- so we create an `ImplicitValueInitExpr` ourselves. + SmallVector InitsForUnion; + if (InitList->getType()->isUnionType() && Inits.empty()) { + assert(Fields.size() == 1); + ImplicitValueInitForUnion.emplace(Fields.front()->getType()); + InitsForUnion.push_back(&*ImplicitValueInitForUnion); + Inits = InitsForUnion; + } + + size_t InitIdx = 0; + + assert(Fields.size() + RD->getNumBases() == Inits.size()); + for (const CXXBaseSpecifier &Base : RD->bases()) { + assert(InitIdx < Inits.size()); + Expr *Init = Inits[InitIdx++]; + BaseInits.emplace_back(&Base, Init); + } + + assert(Fields.size() == Inits.size() - InitIdx); + for (const FieldDecl *Field : Fields) { + assert(InitIdx < Inits.size()); + Expr *Init = Inits[InitIdx++]; + FieldInits.emplace_back(Field, Init); + } +} + +static void insertIfGlobal(const Decl &D, + llvm::DenseSet &Globals) { + if (auto *V = dyn_cast(&D)) + if (V->hasGlobalStorage()) + Globals.insert(V); +} + +static void insertIfFunction(const Decl &D, + llvm::DenseSet &Funcs) { + if (auto *FD = dyn_cast(&D)) + Funcs.insert(FD); +} + +static MemberExpr *getMemberForAccessor(const CXXMemberCallExpr &C) { + // Use getCalleeDecl instead of getMethodDecl in order to handle + // pointer-to-member calls. + const auto *MethodDecl = dyn_cast_or_null(C.getCalleeDecl()); + if (!MethodDecl) + return nullptr; + auto *Body = dyn_cast_or_null(MethodDecl->getBody()); + if (!Body || Body->size() != 1) + return nullptr; + if (auto *RS = dyn_cast(*Body->body_begin())) + if (auto *Return = RS->getRetValue()) + return dyn_cast(Return->IgnoreParenImpCasts()); + return nullptr; +} + +static void getReferencedDecls(const Decl &D, ReferencedDecls &Referenced) { + insertIfGlobal(D, Referenced.Globals); + insertIfFunction(D, Referenced.Functions); + if (const auto *Decomp = dyn_cast(&D)) + for (const auto *B : Decomp->bindings()) + if (auto *ME = dyn_cast_or_null(B->getBinding())) + // FIXME: should we be using `E->getFoundDecl()`? + if (const auto *FD = dyn_cast(ME->getMemberDecl())) + Referenced.Fields.insert(FD); +} + +/// Traverses `S` and inserts into `Referenced` any declarations that are +/// declared in or referenced from sub-statements. +static void getReferencedDecls(const Stmt &S, ReferencedDecls &Referenced) { + for (auto *Child : S.children()) + if (Child != nullptr) + getReferencedDecls(*Child, Referenced); + if (const auto *DefaultArg = dyn_cast(&S)) + getReferencedDecls(*DefaultArg->getExpr(), Referenced); + if (const auto *DefaultInit = dyn_cast(&S)) + getReferencedDecls(*DefaultInit->getExpr(), Referenced); + + if (auto *DS = dyn_cast(&S)) { + if (DS->isSingleDecl()) + getReferencedDecls(*DS->getSingleDecl(), Referenced); + else + for (auto *D : DS->getDeclGroup()) + getReferencedDecls(*D, Referenced); + } else if (auto *E = dyn_cast(&S)) { + insertIfGlobal(*E->getDecl(), Referenced.Globals); + insertIfFunction(*E->getDecl(), Referenced.Functions); + } else if (const auto *C = dyn_cast(&S)) { + // If this is a method that returns a member variable but does nothing else, + // model the field of the return value. + if (MemberExpr *E = getMemberForAccessor(*C)) + if (const auto *FD = dyn_cast(E->getMemberDecl())) + Referenced.Fields.insert(FD); + } else if (auto *E = dyn_cast(&S)) { + // FIXME: should we be using `E->getFoundDecl()`? 
+ const ValueDecl *VD = E->getMemberDecl(); + insertIfGlobal(*VD, Referenced.Globals); + insertIfFunction(*VD, Referenced.Functions); + if (const auto *FD = dyn_cast(VD)) + Referenced.Fields.insert(FD); + } else if (auto *InitList = dyn_cast(&S)) { + if (InitList->getType()->isRecordType()) + for (const auto *FD : getFieldsForInitListExpr(InitList)) + Referenced.Fields.insert(FD); + } +} + +ReferencedDecls getReferencedDecls(const FunctionDecl &FD) { + ReferencedDecls Result; + // Look for global variable and field references in the + // constructor-initializers. + if (const auto *CtorDecl = dyn_cast(&FD)) { + for (const auto *Init : CtorDecl->inits()) { + if (Init->isMemberInitializer()) { + Result.Fields.insert(Init->getMember()); + } else if (Init->isIndirectMemberInitializer()) { + for (const auto *I : Init->getIndirectMember()->chain()) + Result.Fields.insert(cast(I)); + } + const Expr *E = Init->getInit(); + assert(E != nullptr); + getReferencedDecls(*E, Result); + } + // Add all fields mentioned in default member initializers. + for (const FieldDecl *F : CtorDecl->getParent()->fields()) + if (const auto *I = F->getInClassInitializer()) + getReferencedDecls(*I, Result); + } + getReferencedDecls(*FD.getBody(), Result); + + return Result; +} + +} // namespace clang::dataflow diff --git a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt index a3b5d9adc24bda..6631fe27f3d901 100644 --- a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt @@ -1,6 +1,7 @@ add_clang_library(clangAnalysisFlowSensitive AdornedCFG.cpp Arena.cpp + ASTOps.cpp DataflowAnalysisContext.cpp DataflowEnvironment.cpp Formula.cpp diff --git a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp index d520539dd25355..e94fd39c45dc15 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp @@ -14,6 +14,7 @@ #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h" #include "clang/AST/ExprCXX.h" +#include "clang/Analysis/FlowSensitive/ASTOps.h" #include "clang/Analysis/FlowSensitive/DebugSupport.h" #include "clang/Analysis/FlowSensitive/Formula.h" #include "clang/Analysis/FlowSensitive/Logger.h" @@ -359,55 +360,3 @@ DataflowAnalysisContext::~DataflowAnalysisContext() = default; } // namespace dataflow } // namespace clang - -using namespace clang; - -const Expr &clang::dataflow::ignoreCFGOmittedNodes(const Expr &E) { - const Expr *Current = &E; - if (auto *EWC = dyn_cast(Current)) { - Current = EWC->getSubExpr(); - assert(Current != nullptr); - } - Current = Current->IgnoreParens(); - assert(Current != nullptr); - return *Current; -} - -const Stmt &clang::dataflow::ignoreCFGOmittedNodes(const Stmt &S) { - if (auto *E = dyn_cast(&S)) - return ignoreCFGOmittedNodes(*E); - return S; -} - -// FIXME: Does not precisely handle non-virtual diamond inheritance. A single -// field decl will be modeled for all instances of the inherited field. 
-static void getFieldsFromClassHierarchy(QualType Type, - clang::dataflow::FieldSet &Fields) { - if (Type->isIncompleteType() || Type->isDependentType() || - !Type->isRecordType()) - return; - - for (const FieldDecl *Field : Type->getAsRecordDecl()->fields()) - Fields.insert(Field); - if (auto *CXXRecord = Type->getAsCXXRecordDecl()) - for (const CXXBaseSpecifier &Base : CXXRecord->bases()) - getFieldsFromClassHierarchy(Base.getType(), Fields); -} - -/// Gets the set of all fields in the type. -clang::dataflow::FieldSet clang::dataflow::getObjectFields(QualType Type) { - FieldSet Fields; - getFieldsFromClassHierarchy(Type, Fields); - return Fields; -} - -bool clang::dataflow::containsSameFields( - const clang::dataflow::FieldSet &Fields, - const clang::dataflow::RecordStorageLocation::FieldToLoc &FieldLocs) { - if (Fields.size() != FieldLocs.size()) - return false; - for ([[maybe_unused]] auto [Field, Loc] : FieldLocs) - if (!Fields.contains(cast_or_null(Field))) - return false; - return true; -} diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index ee2581143e1141..3bf3807268bee9 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -17,6 +17,7 @@ #include "clang/AST/DeclCXX.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Type.h" +#include "clang/Analysis/FlowSensitive/ASTOps.h" #include "clang/Analysis/FlowSensitive/DataflowLattice.h" #include "clang/Analysis/FlowSensitive/Value.h" #include "llvm/ADT/DenseMap.h" @@ -304,93 +305,6 @@ widenKeyToValueMap(const llvm::MapVector &CurMap, return WidenedMap; } -/// Initializes a global storage value. -static void insertIfGlobal(const Decl &D, - llvm::DenseSet &Vars) { - if (auto *V = dyn_cast(&D)) - if (V->hasGlobalStorage()) - Vars.insert(V); -} - -static void insertIfFunction(const Decl &D, - llvm::DenseSet &Funcs) { - if (auto *FD = dyn_cast(&D)) - Funcs.insert(FD); -} - -static MemberExpr *getMemberForAccessor(const CXXMemberCallExpr &C) { - // Use getCalleeDecl instead of getMethodDecl in order to handle - // pointer-to-member calls. - const auto *MethodDecl = dyn_cast_or_null(C.getCalleeDecl()); - if (!MethodDecl) - return nullptr; - auto *Body = dyn_cast_or_null(MethodDecl->getBody()); - if (!Body || Body->size() != 1) - return nullptr; - if (auto *RS = dyn_cast(*Body->body_begin())) - if (auto *Return = RS->getRetValue()) - return dyn_cast(Return->IgnoreParenImpCasts()); - return nullptr; -} - -static void -getFieldsGlobalsAndFuncs(const Decl &D, FieldSet &Fields, - llvm::DenseSet &Vars, - llvm::DenseSet &Funcs) { - insertIfGlobal(D, Vars); - insertIfFunction(D, Funcs); - if (const auto *Decomp = dyn_cast(&D)) - for (const auto *B : Decomp->bindings()) - if (auto *ME = dyn_cast_or_null(B->getBinding())) - // FIXME: should we be using `E->getFoundDecl()`? - if (const auto *FD = dyn_cast(ME->getMemberDecl())) - Fields.insert(FD); -} - -/// Traverses `S` and inserts into `Fields`, `Vars` and `Funcs` any fields, -/// global variables and functions that are declared in or referenced from -/// sub-statements. 
-static void -getFieldsGlobalsAndFuncs(const Stmt &S, FieldSet &Fields, - llvm::DenseSet &Vars, - llvm::DenseSet &Funcs) { - for (auto *Child : S.children()) - if (Child != nullptr) - getFieldsGlobalsAndFuncs(*Child, Fields, Vars, Funcs); - if (const auto *DefaultArg = dyn_cast(&S)) - getFieldsGlobalsAndFuncs(*DefaultArg->getExpr(), Fields, Vars, Funcs); - if (const auto *DefaultInit = dyn_cast(&S)) - getFieldsGlobalsAndFuncs(*DefaultInit->getExpr(), Fields, Vars, Funcs); - - if (auto *DS = dyn_cast(&S)) { - if (DS->isSingleDecl()) - getFieldsGlobalsAndFuncs(*DS->getSingleDecl(), Fields, Vars, Funcs); - else - for (auto *D : DS->getDeclGroup()) - getFieldsGlobalsAndFuncs(*D, Fields, Vars, Funcs); - } else if (auto *E = dyn_cast(&S)) { - insertIfGlobal(*E->getDecl(), Vars); - insertIfFunction(*E->getDecl(), Funcs); - } else if (const auto *C = dyn_cast(&S)) { - // If this is a method that returns a member variable but does nothing else, - // model the field of the return value. - if (MemberExpr *E = getMemberForAccessor(*C)) - if (const auto *FD = dyn_cast(E->getMemberDecl())) - Fields.insert(FD); - } else if (auto *E = dyn_cast(&S)) { - // FIXME: should we be using `E->getFoundDecl()`? - const ValueDecl *VD = E->getMemberDecl(); - insertIfGlobal(*VD, Vars); - insertIfFunction(*VD, Funcs); - if (const auto *FD = dyn_cast(VD)) - Fields.insert(FD); - } else if (auto *InitList = dyn_cast(&S)) { - if (InitList->getType()->isRecordType()) - for (const auto *FD : getFieldsForInitListExpr(InitList)) - Fields.insert(FD); - } -} - namespace { // Visitor that builds a map from record prvalues to result objects. @@ -653,36 +567,13 @@ void Environment::initialize() { void Environment::initFieldsGlobalsAndFuncs(const FunctionDecl *FuncDecl) { assert(FuncDecl->doesThisDeclarationHaveABody()); - FieldSet Fields; - llvm::DenseSet Vars; - llvm::DenseSet Funcs; - - // Look for global variable and field references in the - // constructor-initializers. - if (const auto *CtorDecl = dyn_cast(FuncDecl)) { - for (const auto *Init : CtorDecl->inits()) { - if (Init->isMemberInitializer()) { - Fields.insert(Init->getMember()); - } else if (Init->isIndirectMemberInitializer()) { - for (const auto *I : Init->getIndirectMember()->chain()) - Fields.insert(cast(I)); - } - const Expr *E = Init->getInit(); - assert(E != nullptr); - getFieldsGlobalsAndFuncs(*E, Fields, Vars, Funcs); - } - // Add all fields mentioned in default member initializers. - for (const FieldDecl *F : CtorDecl->getParent()->fields()) - if (const auto *I = F->getInClassInitializer()) - getFieldsGlobalsAndFuncs(*I, Fields, Vars, Funcs); - } - getFieldsGlobalsAndFuncs(*FuncDecl->getBody(), Fields, Vars, Funcs); + ReferencedDecls Referenced = getReferencedDecls(*FuncDecl); // These have to be added before the lines that follow to ensure that // `create*` work correctly for structs. 
- DACtx->addModeledFields(Fields); + DACtx->addModeledFields(Referenced.Fields); - for (const VarDecl *D : Vars) { + for (const VarDecl *D : Referenced.Globals) { if (getStorageLocation(*D) != nullptr) continue; @@ -694,7 +585,7 @@ void Environment::initFieldsGlobalsAndFuncs(const FunctionDecl *FuncDecl) { setStorageLocation(*D, createObject(*D, nullptr)); } - for (const FunctionDecl *FD : Funcs) { + for (const FunctionDecl *FD : Referenced.Functions) { if (getStorageLocation(*FD) != nullptr) continue; auto &Loc = createStorageLocation(*FD); @@ -1354,64 +1245,6 @@ RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME, return Env.get(*Base); } -std::vector -getFieldsForInitListExpr(const InitListExpr *InitList) { - const RecordDecl *RD = InitList->getType()->getAsRecordDecl(); - assert(RD != nullptr); - - std::vector Fields; - - if (InitList->getType()->isUnionType()) { - Fields.push_back(InitList->getInitializedFieldInUnion()); - return Fields; - } - - // Unnamed bitfields are only used for padding and do not appear in - // `InitListExpr`'s inits. However, those fields do appear in `RecordDecl`'s - // field list, and we thus need to remove them before mapping inits to - // fields to avoid mapping inits to the wrongs fields. - llvm::copy_if( - RD->fields(), std::back_inserter(Fields), - [](const FieldDecl *Field) { return !Field->isUnnamedBitfield(); }); - return Fields; -} - -RecordInitListHelper::RecordInitListHelper(const InitListExpr *InitList) { - auto *RD = InitList->getType()->getAsCXXRecordDecl(); - assert(RD != nullptr); - - std::vector Fields = getFieldsForInitListExpr(InitList); - ArrayRef Inits = InitList->inits(); - - // Unions initialized with an empty initializer list need special treatment. - // For structs/classes initialized with an empty initializer list, Clang - // puts `ImplicitValueInitExpr`s in `InitListExpr::inits()`, but for unions, - // it doesn't do this -- so we create an `ImplicitValueInitExpr` ourselves. 
- SmallVector InitsForUnion; - if (InitList->getType()->isUnionType() && Inits.empty()) { - assert(Fields.size() == 1); - ImplicitValueInitForUnion.emplace(Fields.front()->getType()); - InitsForUnion.push_back(&*ImplicitValueInitForUnion); - Inits = InitsForUnion; - } - - size_t InitIdx = 0; - - assert(Fields.size() + RD->getNumBases() == Inits.size()); - for (const CXXBaseSpecifier &Base : RD->bases()) { - assert(InitIdx < Inits.size()); - Expr *Init = Inits[InitIdx++]; - BaseInits.emplace_back(&Base, Init); - } - - assert(Fields.size() == Inits.size() - InitIdx); - for (const FieldDecl *Field : Fields) { - assert(InitIdx < Inits.size()); - Expr *Init = Inits[InitIdx++]; - FieldInits.emplace_back(Field, Init); - } -} - RecordValue &refreshRecordValue(RecordStorageLocation &Loc, Environment &Env) { auto &NewVal = Env.create(Loc); Env.setValue(Loc, NewVal); diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index 88a9c0eccbebc0..1e034771014eaa 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -20,7 +20,9 @@ #include "clang/AST/OperationKinds.h" #include "clang/AST/Stmt.h" #include "clang/AST/StmtVisitor.h" +#include "clang/Analysis/FlowSensitive/ASTOps.h" #include "clang/Analysis/FlowSensitive/AdornedCFG.h" +#include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h" #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h" #include "clang/Analysis/FlowSensitive/NoopAnalysis.h" #include "clang/Analysis/FlowSensitive/RecordOps.h" From bbd64c4ddf08be468ab4eb4c161e28bdab6808bb Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 16 Apr 2024 11:40:23 -0700 Subject: [PATCH 153/300] [RISCV] Add coverage for strength reduction of mul as 2^N - 2^M --- llvm/test/CodeGen/RISCV/mul.ll | 196 ++++++++++++++++++++++++++++++++- 1 file changed, 192 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll index af341dbaadeabd..364e8c7b38dacc 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -465,6 +465,192 @@ define i32 @mulhu_constant(i32 %a) nounwind { ret i32 %4 } +define i32 @muli32_p14(i32 %a) nounwind { +; RV32I-LABEL: muli32_p14: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 14 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p14: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 14 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p14: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: li a1, 14 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p14: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 14 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 14 + ret i32 %1 +} + +define i32 @muli32_p28(i32 %a) nounwind { +; RV32I-LABEL: muli32_p28: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 28 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p28: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 28 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p28: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: li a1, 28 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p28: +; RV64IM: # %bb.0: +; 
RV64IM-NEXT: li a1, 28 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 28 + ret i32 %1 +} + +define i32 @muli32_p30(i32 %a) nounwind { +; RV32I-LABEL: muli32_p30: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 30 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p30: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 30 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p30: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: li a1, 30 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p30: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 30 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 30 + ret i32 %1 +} + +define i32 @muli32_p56(i32 %a) nounwind { +; RV32I-LABEL: muli32_p56: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 56 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p56: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 56 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p56: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: li a1, 56 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p56: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 56 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 56 + ret i32 %1 +} + +define i32 @muli32_p60(i32 %a) nounwind { +; RV32I-LABEL: muli32_p60: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 60 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p60: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 60 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p60: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: li a1, 60 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p60: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 60 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 60 + ret i32 %1 +} + +define i32 @muli32_p62(i32 %a) nounwind { +; RV32I-LABEL: muli32_p62: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 62 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p62: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 62 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p62: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: li a1, 62 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p62: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 62 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 62 + ret i32 %1 +} + define i32 @muli32_p65(i32 %a) nounwind { ; RV32I-LABEL: muli32_p65: ; RV32I: # %bb.0: @@ -600,6 +786,8 @@ define i64 @muli64_p63(i64 %a) nounwind { ret i64 %1 } + + define i32 @muli32_m63(i32 %a) nounwind { ; RV32I-LABEL: muli32_m63: ; RV32I: # %bb.0: @@ -1145,10 +1333,10 @@ define i128 @muli128_m3840(i128 %a) nounwind { ; RV32I-NEXT: sltu a7, a6, a4 ; RV32I-NEXT: sub t0, t1, t0 ; RV32I-NEXT: mv t1, a7 -; RV32I-NEXT: beq a5, a3, .LBB30_2 +; RV32I-NEXT: beq a5, a3, .LBB36_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: 
sltu t1, a5, a3
-; RV32I-NEXT:    .LBB30_2:
+; RV32I-NEXT:    .LBB36_2:
 ; RV32I-NEXT:    sub a2, a2, a1
 ; RV32I-NEXT:    sltu a1, a2, t1
 ; RV32I-NEXT:    sub a1, t0, a1
@@ -1261,10 +1449,10 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32I-NEXT:    slli t0, a1, 6
 ; RV32I-NEXT:    or a7, t0, a7
 ; RV32I-NEXT:    mv t0, a5
-; RV32I-NEXT:    beq a1, a7, .LBB31_2
+; RV32I-NEXT:    beq a1, a7, .LBB37_2
 ; RV32I-NEXT:    # %bb.1:
 ; RV32I-NEXT:    sltu t0, a1, a7
-; RV32I-NEXT:    .LBB31_2:
+; RV32I-NEXT:    .LBB37_2:
 ; RV32I-NEXT:    srli t1, a1, 26
 ; RV32I-NEXT:    slli t2, a6, 6
 ; RV32I-NEXT:    or t1, t2, t1

From 8885813ebb0a61014d99ac776b8118d935848cc9 Mon Sep 17 00:00:00 2001
From: Mark de Wever
Date: Tue, 16 Apr 2024 20:45:12 +0200
Subject: [PATCH 154/300] [libc++][chrono] Disables a test.

This test seems problematic on different platforms. There is still a
test that ensures coverage in an automated fashion. This test needs to
be investigated.

---
 .../time.zone.members/get_info.sys_time.pass.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp
index a751a2fb6347b5..d27cf0bd89062e 100644
--- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp
@@ -6,7 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17
+// TODO TZDB review the test based on review comments in
+// https://github.com/llvm/llvm-project/pull/85619
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23, c++26
 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb
 // XFAIL: libcpp-has-no-incomplete-tzdb

From 0665669876cd7f51f7572cff3bb97485d78f5de5 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Tue, 16 Apr 2024 11:49:25 -0700
Subject: [PATCH 155/300] [Sema] Mark alias/ifunc targets used and consider
 mangled names

https://reviews.llvm.org/D54188 marked "alias" targets as used in C to
fix -Wunused false positives. This patch extends the approach to handle
mangled names to support global scope names in C++ and the
`overloadable` attribute in C.

(Note: we should skip `UsingShadowDecl`, which would trigger an
assertion failure in `ItaniumMangleContextImpl::mangleCXXName`. See
regression test added by commit
1c2afbae9af22b58190c10e3517242d01d89d612.)

In addition, we mark ifunc targets as used to fix #63957 (temporarily
used by xz; ifunc was removed by
https://github.com/tukaani-project/xz/commit/689ae2427342a2ea1206eb5ca08301baf410e7e0)

While our approach has false negatives for namespace scope names, the
majority of alias/ifunc C++ uses (global scope with no overloads) are
handled.

Note: The following function with internal linkage but C language
linkage type is mangled in Clang but not in GCC. This inconsistency
makes alias/ifunc difficult to use portably in C++ (#88593).
``` extern "C" { static void f0() {} // GCC: void g0() __attribute__((alias("_ZL2f0v"))); // Clang: void g0() __attribute__((alias("f0"))); } ``` Pull Request: https://github.com/llvm/llvm-project/pull/87130 --- clang/lib/Sema/CMakeLists.txt | 1 + clang/lib/Sema/SemaDeclAttr.cpp | 46 ++++++++++++++----- clang/test/AST/ast-dump-attr-json.cpp | 1 + clang/test/Sema/alias-unused-win.cpp | 2 +- clang/test/Sema/alias-unused.cpp | 16 ++++--- .../llvm-project-overlay/clang/BUILD.bazel | 1 + 6 files changed, 48 insertions(+), 19 deletions(-) diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt index ab3b813a9ccd97..a96439df664228 100644 --- a/clang/lib/Sema/CMakeLists.txt +++ b/clang/lib/Sema/CMakeLists.txt @@ -1,5 +1,6 @@ set(LLVM_LINK_COMPONENTS Core + Demangle FrontendHLSL FrontendOpenMP MC diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index b7b1fbc625a150..c3bf18a3f79e23 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -45,6 +45,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Demangle/Demangle.h" #include "llvm/IR/Assumptions.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Support/Error.h" @@ -1983,6 +1984,38 @@ static void handleWeakRefAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(::new (S.Context) WeakRefAttr(S.Context, AL)); } +// Mark alias/ifunc target as used. Due to name mangling, we look up the +// demangled name ignoring parameters (not supported by microsoftDemangle +// https://github.com/llvm/llvm-project/issues/88825). This should handle the +// majority of use cases while leaving namespace scope names unmarked. +static void markUsedForAliasOrIfunc(Sema &S, Decl *D, const ParsedAttr &AL, + StringRef Str) { + std::unique_ptr Demangled; + if (S.getASTContext().getCXXABIKind() != TargetCXXABI::Microsoft) + Demangled.reset(llvm::itaniumDemangle(Str, /*ParseParams=*/false)); + std::unique_ptr MC(S.Context.createMangleContext()); + SmallString<256> Name; + + const DeclarationNameInfo Target( + &S.Context.Idents.get(Demangled ? Demangled.get() : Str), AL.getLoc()); + LookupResult LR(S, Target, Sema::LookupOrdinaryName); + if (S.LookupName(LR, S.TUScope)) { + for (NamedDecl *ND : LR) { + if (!isa(ND) && !isa(ND)) + continue; + if (MC->shouldMangleDeclName(ND)) { + llvm::raw_svector_ostream Out(Name); + Name.clear(); + MC->mangleName(GlobalDecl(ND), Out); + } else { + Name = ND->getIdentifier()->getName(); + } + if (Name == Str) + ND->markUsed(S.Context); + } + } +} + static void handleIFuncAttr(Sema &S, Decl *D, const ParsedAttr &AL) { StringRef Str; if (!S.checkStringLiteralArgumentAttr(AL, 0, Str)) @@ -1995,6 +2028,7 @@ static void handleIFuncAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } + markUsedForAliasOrIfunc(S, D, AL, Str); D->addAttr(::new (S.Context) IFuncAttr(S.Context, AL, Str)); } @@ -2029,17 +2063,7 @@ static void handleAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) { } } - // Mark target used to prevent unneeded-internal-declaration warnings. - if (!S.LangOpts.CPlusPlus) { - // FIXME: demangle Str for C++, as the attribute refers to the mangled - // linkage name, not the pre-mangled identifier. 
- const DeclarationNameInfo target(&S.Context.Idents.get(Str), AL.getLoc()); - LookupResult LR(S, target, Sema::LookupOrdinaryName); - if (S.LookupQualifiedName(LR, S.getCurLexicalContext())) - for (NamedDecl *ND : LR) - ND->markUsed(S.Context); - } - + markUsedForAliasOrIfunc(S, D, AL, Str); D->addAttr(::new (S.Context) AliasAttr(S.Context, AL, Str)); } diff --git a/clang/test/AST/ast-dump-attr-json.cpp b/clang/test/AST/ast-dump-attr-json.cpp index 051c2956abfdf7..883e584bfedf07 100644 --- a/clang/test/AST/ast-dump-attr-json.cpp +++ b/clang/test/AST/ast-dump-attr-json.cpp @@ -46,6 +46,7 @@ __thread __attribute__ ((tls_model ("local-exec"))) int tls_model_var; // CHECK-NEXT: "tokLen": 11 // CHECK-NEXT: } // CHECK-NEXT: }, +// CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "global_decl", // CHECK-NEXT: "mangledName": "global_decl", // CHECK-NEXT: "type": { diff --git a/clang/test/Sema/alias-unused-win.cpp b/clang/test/Sema/alias-unused-win.cpp index 47c96d41175179..97d57a3bbd1e31 100644 --- a/clang/test/Sema/alias-unused-win.cpp +++ b/clang/test/Sema/alias-unused-win.cpp @@ -7,7 +7,7 @@ extern "C" { static int f(void) { return 42; } // cxx-warning{{unused function 'f'}} int g(void) __attribute__((alias("f"))); -static int foo [] = { 42, 0xDEAD }; // cxx-warning{{variable 'foo' is not needed and will not be emitted}} +static int foo [] = { 42, 0xDEAD }; extern typeof(foo) bar __attribute__((unused, alias("foo"))); static int __attribute__((overloadable)) f0(int x) { return x; } // expected-warning{{unused function 'f0'}} diff --git a/clang/test/Sema/alias-unused.cpp b/clang/test/Sema/alias-unused.cpp index dc8e46f072d74d..c0b541c880e525 100644 --- a/clang/test/Sema/alias-unused.cpp +++ b/clang/test/Sema/alias-unused.cpp @@ -14,24 +14,26 @@ extern typeof(foo) bar __attribute__((unused, alias("foo"))); /// We report a warning in C++ mode because the internal linkage `resolver` gets /// mangled as it does not have a language linkage. GCC does not mangle /// `resolver` or report a warning. -static int (*resolver(void))(void) { return f; } // expected-warning{{unused function 'resolver'}} +static int (*resolver(void))(void) { return f; } // cxx-warning{{unused function 'resolver'}} int ifunc(void) __attribute__((ifunc("resolver"))); -static int __attribute__((overloadable)) f0(int x) { return x; } // expected-warning{{unused function 'f0'}} +static int __attribute__((overloadable)) f0(int x) { return x; } static float __attribute__((overloadable)) f0(float x) { return x; } // expected-warning{{unused function 'f0'}} int g0(void) __attribute__((alias("_ZL2f0i"))); #ifdef __cplusplus -static int f1() { return 42; } // expected-warning{{unused function 'f1'}} +static int f1() { return 42; } int g1(void) __attribute__((alias("_ZL2f1v"))); } -static int f2(int) { return 42; } // expected-warning{{unused function 'f2'}} -static int f2() { return 42; } // expected-warning{{unused function 'f2'}} +/// We demangle alias/ifunc target and mark all found functions as used. 
+ +static int f2(int) { return 42; } // cxx-warning{{unused function 'f2'}} +static int f2() { return 42; } int g2() __attribute__((alias("_ZL2f2v"))); -static int (*resolver1())() { return f; } // expected-warning{{unused function 'resolver1'}} -static int (*resolver1(int))() { return f; } // expected-warning{{unused function 'resolver1'}} +static int (*resolver1())() { return f; } // cxx-warning{{unused function 'resolver1'}} +static int (*resolver1(int))() { return f; } int ifunc1() __attribute__((ifunc("_ZL9resolver1i"))); /// TODO: We should report "unused function" for f3(int). diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index c2f77e3abca0e6..725ac6bb38120b 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1136,6 +1136,7 @@ cc_library( "//llvm:AllTargetsAsmParsers", "//llvm:AllTargetsCodeGens", "//llvm:Core", + "//llvm:Demangle", "//llvm:FrontendHLSL", "//llvm:FrontendOpenMP", "//llvm:MC", From 5422eb0b841521908c2fc60bd9c7fdc11ded12a1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 16 Apr 2024 11:50:49 -0700 Subject: [PATCH 156/300] [memprof] Add another constructor to MemProfReader (#88952) This patch enables users of MemProfReader to directly supply mappings from CallStackId to actual call stacks. Once the users of the current constructor without CSIdMap switch to the new constructor, we'll have fewer users of: - IndexedAllocationInfo::CallStack - IndexedMemProfRecord::CallSites bringing us one step closer to the removal of these fields in favor of: - IndexedAllocationInfo::CSId - IndexedMemProfRecord::CallSiteIds --- llvm/include/llvm/ProfileData/MemProfReader.h | 9 ++++ llvm/unittests/ProfileData/MemProfTest.cpp | 41 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/llvm/include/llvm/ProfileData/MemProfReader.h b/llvm/include/llvm/ProfileData/MemProfReader.h index 1f84fefad03e39..7fa8af184dc93b 100644 --- a/llvm/include/llvm/ProfileData/MemProfReader.h +++ b/llvm/include/llvm/ProfileData/MemProfReader.h @@ -98,6 +98,15 @@ class MemProfReader { llvm::DenseMap FrameIdMap, llvm::MapVector ProfData); + // Initialize the MemProfReader with the frame mappings, call stack mappings, + // and profile contents. + MemProfReader( + llvm::DenseMap FrameIdMap, + llvm::DenseMap> CSIdMap, + llvm::MapVector ProfData) + : IdToFrame(std::move(FrameIdMap)), CSIdToCallStack(std::move(CSIdMap)), + FunctionProfileData(std::move(ProfData)) {} + protected: // A helper method to extract the frame from the IdToFrame map. 
 const Frame &idToFrame(const FrameId Id) const {
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index ab9227e9df881b..f596919ed039a8 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -436,6 +436,47 @@ TEST(MemProf, BaseMemProfReader) {
                           FrameContains("bar", 10U, 2U, false));
 }

+TEST(MemProf, BaseMemProfReaderWithCSIdMap) {
+  llvm::DenseMap FrameIdMap;
+  Frame F1(/*Hash=*/IndexedMemProfRecord::getGUID("foo"), /*LineOffset=*/20,
+           /*Column=*/5, /*IsInlineFrame=*/true);
+  Frame F2(/*Hash=*/IndexedMemProfRecord::getGUID("bar"), /*LineOffset=*/10,
+           /*Column=*/2, /*IsInlineFrame=*/false);
+  FrameIdMap.insert({F1.hash(), F1});
+  FrameIdMap.insert({F2.hash(), F2});
+
+  llvm::DenseMap> CSIdMap;
+  llvm::SmallVector CallStack = {F1.hash(), F2.hash()};
+  CallStackId CSId = llvm::memprof::hashCallStack(CallStack);
+  CSIdMap.insert({CSId, CallStack});
+
+  llvm::MapVector ProfData;
+  IndexedMemProfRecord FakeRecord;
+  MemInfoBlock Block;
+  Block.AllocCount = 1U, Block.TotalAccessDensity = 4,
+  Block.TotalLifetime = 200001;
+  FakeRecord.AllocSites.emplace_back(
+      /*CS=*/llvm::SmallVector(),
+      /*CSId=*/llvm::memprof::hashCallStack(CallStack),
+      /*MB=*/Block);
+  ProfData.insert({F1.hash(), FakeRecord});
+
+  MemProfReader Reader(FrameIdMap, CSIdMap, ProfData);
+
+  llvm::SmallVector Records;
+  for (const auto &KeyRecordPair : Reader) {
+    Records.push_back(KeyRecordPair.second);
+  }
+
+  ASSERT_THAT(Records, SizeIs(1));
+  ASSERT_THAT(Records[0].AllocSites, SizeIs(1));
+  ASSERT_THAT(Records[0].AllocSites[0].CallStack, SizeIs(2));
+  EXPECT_THAT(Records[0].AllocSites[0].CallStack[0],
+              FrameContains("foo", 20U, 5U, true));
+  EXPECT_THAT(Records[0].AllocSites[0].CallStack[1],
+              FrameContains("bar", 10U, 2U, false));
+}
+
 TEST(MemProf, IndexedMemProfRecordToMemProfRecord) {
   // Verify that MemProfRecord can be constructed from IndexedMemProfRecord with
   // CallStackIds only.

From c7657cf7d1ee57f9cb9133164536591a1842b43c Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 16 Apr 2024 14:54:06 -0400
Subject: [PATCH 157/300] [SLP] Keep externally used GEPs as GEPs, if possible,
 instead of extractelement.

If the vectorized GEP instruction can still be kept as a scalar GEP, it is
better to keep it as a scalar instead of emitting an extractelement; in many
cases this is more profitable.
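As a rough illustration, a hand-reduced sketch (hypothetical IR with made-up
value names; it mirrors the pattern in the updated tests below): the externally
used lane used to be extracted from the vectorized GEP and is now
re-materialized as a scalar GEP, which is free or nearly free on most targets.

```
; Before: lane 0 of the vectorized GEP is extracted for the external user.
  %gep.vec = getelementptr i64, <2 x ptr> %base.vec, <2 x i64> <i64 1, i64 3>
  %ext = extractelement <2 x ptr> %gep.vec, i32 0
  store ptr %ext, ptr %out, align 8

; After: a scalar GEP is cloned for the external user, while %gep.vec remains
; available to the in-tree (vectorized) users.
  %gep.vec = getelementptr i64, <2 x ptr> %base.vec, <2 x i64> <i64 1, i64 3>
  %ext = getelementptr inbounds i64, ptr %base, i64 1
  store ptr %ext, ptr %out, align 8
```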
Metric: size..text

Program                                                                      size..text
                                                                             results     results0    diff
test-suite :: SingleSource/Benchmarks/Misc/oourafft.test                       18911.00    19695.00   4.1%
test-suite :: SingleSource/Benchmarks/Misc-C++-EH/spirit.test                  59987.00    60707.00   1.2%
test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test    1392209.00  1392753.00   0.0%
test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test     1392209.00  1392753.00   0.0%
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test        1087996.00  1088236.00   0.0%
test-suite :: MultiSource/Benchmarks/Bullet/bullet.test                       309310.00   309342.00   0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test           664661.00   664693.00   0.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test          664661.00   664693.00   0.0%
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test    12354636.00 12354908.00   0.0%
test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test               1152748.00  1152716.00  -0.0%
test-suite :: MultiSource/Applications/oggenc/oggenc.test                     191787.00   191771.00  -0.0%
test-suite :: SingleSource/UnitTests/matrix-types-spec.test                   480796.00   480476.00  -0.1%

Misc/oourafft - Extra code gets vectorized
Misc-C++-EH/spirit - same
CFP2017speed/638.imagick_s
CFP2017rate/538.imagick_r - same, extra code gets vectorized
CINT2006/400.perlbench - some extra 4 x ptr stores vectorized
Bullet/bullet - extra 4 x ptr store vectorized
CINT2017rate/525.x264_r
CINT2017speed/625.x264_s - same
CFP2017rate/526.blender_r - extra 8 x float stores (several), some extra 4 x ptr stores
CFP2006/453.povray - 2 x double loads/stores replaced by 4 x double loads/stores
Applications/oggenc - extra code is vectorized
UnitTests/matrix-types-spec - extra code gets vectorized

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/88877

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 50 ++++++++++++++++++-
 .../SLPVectorizer/X86/extract_in_tree_user.ll |  4 +-
 .../SLPVectorizer/X86/geps-non-pow-2.ll       | 17 ++++---
 .../SLPVectorizer/X86/opaque-ptr.ll           | 19 +++----
 .../X86/reorder-reused-masked-gather2.ll      |  2 +-
 .../SLPVectorizer/X86/stacksave-dependence.ll |  4 +-
 6 files changed, 71 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0cd3ca32933ca2..7694627c3b0430 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1134,6 +1134,7 @@ class BoUpSLP {
     MustGather.clear();
     EntryToLastInstruction.clear();
     ExternalUses.clear();
+    ExternalUsesAsGEPs.clear();
     for (auto &Iter : BlocksSchedules) {
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
@@ -3154,6 +3155,10 @@ class BoUpSLP {
   /// after vectorization.
   UserList ExternalUses;

+  /// A list of GEPs which can be replaced by scalar GEPs instead of
+  /// extractelement instructions.
+  SmallPtrSet ExternalUsesAsGEPs;
+
   /// Values used only by @llvm.assume calls.
SmallPtrSet EphValues; @@ -5541,6 +5546,7 @@ void BoUpSLP::buildExternalUses( << FoundLane << " from " << *Scalar << ".\n"); ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()); ExternalUses.emplace_back(Scalar, nullptr, FoundLane); + continue; } for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); @@ -9925,6 +9931,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { SmallVector DemandedElts; SmallDenseSet UsedInserts; DenseSet> VectorCasts; + std::optional> ValueToExtUses; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!isa_and_nonnull(EU.User) && @@ -10033,12 +10040,40 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { } } } + // Leave the GEPs as is, they are free in most cases and better to keep them + // as GEPs. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + if (auto *GEP = dyn_cast(EU.Scalar)) { + if (!ValueToExtUses) { + ValueToExtUses.emplace(); + for_each(enumerate(ExternalUses), [&](const auto &P) { + ValueToExtUses->try_emplace(P.value().Scalar, P.index()); + }); + } + // Can use original GEP, if no operands vectorized or they are marked as + // externally used already. + bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) { + if (!getTreeEntry(V)) + return true; + auto It = ValueToExtUses->find(V); + if (It != ValueToExtUses->end()) { + // Replace all uses to avoid compiler crash. + ExternalUses[It->second].User = nullptr; + return true; + } + return false; + }); + if (CanBeUsedAsGEP) { + ExtractCost += TTI->getInstructionCost(GEP, CostKind); + ExternalUsesAsGEPs.insert(EU.Scalar); + continue; + } + } // If we plan to rewrite the tree in a smaller type, we will need to sign // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto It = MinBWs.find(getTreeEntry(EU.Scalar)); if (It != MinBWs.end()) { auto *MinTy = IntegerType::get(F->getContext(), It->second.first); @@ -13161,6 +13196,8 @@ Value *BoUpSLP::vectorizeTree( if (Scalar->getType() != Vec->getType()) { Value *Ex = nullptr; Value *ExV = nullptr; + auto *GEP = dyn_cast(Scalar); + bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP); auto It = ScalarToEEs.find(Scalar); if (It != ScalarToEEs.end()) { // No need to emit many extracts, just move the only one in the @@ -13186,6 +13223,15 @@ Value *BoUpSLP::vectorizeTree( if (const TreeEntry *ETE = getTreeEntry(V)) V = ETE->VectorizedValue; Ex = Builder.CreateExtractElement(V, ES->getIndexOperand()); + } else if (ReplaceGEP) { + // Leave the GEPs as is, they are free in most cases and better to + // keep them as GEPs. 
+ auto *CloneGEP = GEP->clone(); + CloneGEP->insertBefore(*Builder.GetInsertBlock(), + Builder.GetInsertPoint()); + if (GEP->hasName()) + CloneGEP->takeName(GEP); + Ex = CloneGEP; } else { Ex = Builder.CreateExtractElement(Vec, Lane); } @@ -13224,6 +13270,8 @@ Value *BoUpSLP::vectorizeTree( assert((ExternallyUsedValues.count(Scalar) || any_of(Scalar->users(), [&](llvm::User *U) { + if (ExternalUsesAsGEPs.contains(U)) + return true; TreeEntry *UseEntry = getTreeEntry(U); return UseEntry && (UseEntry->State == TreeEntry::Vectorize || diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll index 096f57d100a50f..c600d75ed1e8c4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -13,7 +13,7 @@ define i32 @fn1() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> ; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP4]], align 8 ; CHECK-NEXT: ret i32 undef @@ -92,7 +92,7 @@ define void @externally_used_ptrs() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll index aa679743583064..e459cd8c6955b0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -13,25 +13,26 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK: while.body: ; CHECK-NEXT: [[C_022:%.*]] = phi ptr [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[C_022]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP9]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> ; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [ -; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] -; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] +; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] +; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: [[TMP5:%.*]] = 
extractelement <2 x ptr> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 1 -; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2 +; CHECK-NEXT: store i32 [[TMP7]], ptr [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: ; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> @@ -39,7 +40,7 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: -; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] +; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] ; CHECK-NEXT: [[TMP14]] = phi <2 x ptr> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll index 3801fa5c787b6d..c40be9690cce1d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll @@ -52,17 +52,14 @@ define void @test(ptr %r, ptr %p, ptr %q) #0 { define void @test2(ptr %a, ptr %b) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 1 -; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2 -; CHECK-NEXT: [[I1:%.*]] = ptrtoint ptr [[A1]] to i64 -; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 3 -; CHECK-NEXT: [[I2:%.*]] = ptrtoint ptr [[B3]] to i64 -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A1]], align 8 -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 -; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[I1]], [[V1]] -; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[I2]], [[V2]] -; CHECK-NEXT: store i64 [[ADD1]], ptr [[A1]], align 8 -; CHECK-NEXT: store i64 [[ADD2]], ptr [[A2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> +; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[A1]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[A1]], align 8 ; CHECK-NEXT: ret void ; %a1 = getelementptr inbounds i64, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
index ddc2a1b819041f..30f328293cdaa3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll
@@ -9,7 +9,7 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> , <4 x float> poison)
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll
index 0125e5fab089b2..e93c5244dfbe2c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll
@@ -35,7 +35,7 @@ define void @allocas(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1
 ; CHECK-NEXT:    store ptr [[TMP4]], ptr [[A:%.*]], align 8
 ; CHECK-NEXT:    store <2 x ptr> [[TMP3]], ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    ret void
@@ -127,7 +127,7 @@ define void @stacksave2(ptr %a, ptr %b, ptr %c) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1
 ; CHECK-NEXT:    store ptr [[TMP4]], ptr [[A:%.*]], align 8
 ; CHECK-NEXT:    call void @use(ptr inalloca(i8) [[V2]]) #[[ATTR5:[0-9]+]]
 ; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr [[STACK]])

From 7d4e8c1f3bbfe976f4871c9cf953f76d771b0eda Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 16 Apr 2024 14:55:41 -0400
Subject: [PATCH 158/300] [SLP] Attempt to vectorize long stores, if short
 ones failed.

We can try to vectorize long store sequences when the shorter ones were
rejected as unprofitable. This should not increase compile time
significantly (the stores are already sorted, so the complexity is
n x log n), but it vectorizes extra code.
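For illustration, a sketch of the situation this helps (hypothetical IR; the
concrete effect is visible in the pr46983.ll update below). Assume eight
consecutive stores and a target whose first-round candidate VFs top out at 4:

```
; If vectorizing the <4 x i32> slices is rejected as unprofitable, a second
; attempt now retries with the larger VF 8 and can emit one <8 x i32> store.
  store i32 %v0, ptr %p, align 4
  %p1 = getelementptr inbounds i32, ptr %p, i64 1
  store i32 %v1, ptr %p1, align 4
  %p2 = getelementptr inbounds i32, ptr %p, i64 2
  store i32 %v2, ptr %p2, align 4
  ; ...and so on for %v3 through %v7 at offsets 3..7.
```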
Metric: size..text Program size..text results results0 diff test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test 1088012.00 1088236.00 0.0% test-suite :: SingleSource/UnitTests/matrix-types-spec.test 480396.00 480476.00 0.0% test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 664613.00 664661.00 0.0% test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 664613.00 664661.00 0.0% test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 2041105.00 2040961.00 -0.0% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 836563.00 836387.00 -0.0% test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 1035100.00 1032140.00 -0.3% In all benchmarks extra code gets vectorized Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/88563 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 81 ++++++++++++------- .../Transforms/SLPVectorizer/X86/pr46983.ll | 46 +++-------- 2 files changed, 62 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7694627c3b0430..8ae38550d3095d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15237,39 +15237,60 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, Size *= 2; }); unsigned StartIdx = 0; - for (unsigned Size : CandidateVFs) { - for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); - assert( - all_of( - Slice, - [&](Value *V) { - return cast(V)->getValueOperand()->getType() == - cast(Slice.front()) - ->getValueOperand() - ->getType(); - }) && - "Expected all operands of same type."); - if (!VectorizedStores.count(Slice.front()) && - !VectorizedStores.count(Slice.back()) && - TriedSequences.insert(std::make_pair(Slice.front(), Slice.back())) - .second && - vectorizeStoreChain(Slice, R, Cnt, MinVF)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Slice.begin(), Slice.end()); - Changed = true; - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += Size; - Cnt += Size; - continue; + unsigned Repeat = 0; + constexpr unsigned MaxAttempts = 2; + while (true) { + ++Repeat; + for (unsigned Size : CandidateVFs) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); + assert( + all_of( + Slice, + [&](Value *V) { + return cast(V)->getValueOperand()->getType() == + cast(Slice.front()) + ->getValueOperand() + ->getType(); + }) && + "Expected all operands of same type."); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + TriedSequences + .insert(std::make_pair(Slice.front(), Slice.back())) + .second && + vectorizeStoreChain(Slice, R, Cnt, MinVF)) { + // Mark the vectorized stores so that we don't vectorize them + // again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize + // it again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; + } + ++Cnt; + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) { + Repeat = MaxAttempts; + break; } - ++Cnt; } - // Check if the whole array was vectorized already - exit. 
- if (StartIdx >= Operands.size()) + // Check if tried all attempts or no need for the last attempts at all. + if (Repeat >= MaxAttempts) break; + const unsigned MaxTotalNum = bit_floor(Operands.size() - StartIdx); + if (MaxVF >= MaxTotalNum) + break; + // Last attempt to vectorize max number of elements, if all previous + // attempts were unsuccessful because of the cost issues. + CandidateVFs.clear(); + for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2) { + CandidateVFs.push_back(Size); + } } } }; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll index 75505f632a43f3..3deab0975ce764 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -100,41 +100,17 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) { define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) { ; SSE-LABEL: @store_i64( ; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; SSE-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] -; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 -; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 -; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 -; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 -; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 -; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 -; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] -; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 -; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 -; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 -; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 -; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 -; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] -; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 -; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 -; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 -; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 -; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 -; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24 -; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] -; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 -; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 -; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 -; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 -; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 -; SSE-NEXT: store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer +; SSE-NEXT: 
[[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]] +; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], +; SSE-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; SSE-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], +; SSE-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; SSE-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> +; SSE-NEXT: [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64> +; SSE-NEXT: store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @store_i64( From 3eff86f82cb59d7dfc88e0cc3d8df8282f24f028 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 16 Apr 2024 18:59:12 +0000 Subject: [PATCH 159/300] [gn build] Port 9ec8c961664d --- .../utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn index 04f20211b3c710..22433459a78786 100644 --- a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn @@ -23,6 +23,7 @@ static_library("FlowSensitive") { target_gen_dir, ] sources = [ + "ASTOps.cpp", "AdornedCFG.cpp", "Arena.cpp", "DataflowAnalysisContext.cpp", From b0ddbfb77d15e00e08fc36f6ccd8a4fecde465d1 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Tue, 16 Apr 2024 12:09:32 -0700 Subject: [PATCH 160/300] [clang][SPIR-V] Set AS for the SPIR-V logical triple (#88939) This was missed in #88455, causing most of the .hlsl to SPIR-V tests to fail (such as clang\test\Driver\hlsl-lang-targets-spirv.hlsl) --- clang/lib/Basic/Targets/SPIR.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 9a4a8b501460b6..44265445ff004b 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -315,7 +315,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRVTargetInfo : public BaseSPIRVTargetInfo { // SPIR-V IDs are represented with a single 32-bit word. SizeType = TargetInfo::UnsignedInt; resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, From c9731a3dccd381849bfede5e09290c0574efa248 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 16 Apr 2024 12:10:05 -0700 Subject: [PATCH 161/300] [mlir] Fix a warning about an extraneous semicolon This patch fixes: mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp:58:2: error: extra ';' outside of a function is incompatible with C++98 [-Werror,-Wc++98-compat-extra-semi] --- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 621986c54d492c..530c50ef74f7a0 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -55,7 +55,7 @@ static int64_t getRankOf(Value val) { if (auto ty = llvm::dyn_cast(type)) return ty.getRank(); return 0; -}; +} static bool isReadHintOrNone(const CachePolicyAttr &attr) { if (!attr) From 0a789ea8a829da345e46d8224d73b2ddaba6969f Mon Sep 17 00:00:00 2001 From: erichkeane Date: Tue, 16 Apr 2024 12:12:25 -0700 Subject: [PATCH 162/300] Fix test from #83124 and #88902 This just replaces an '#include' with a declaration of array placement new. 
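For context, a minimal sketch of what the test relies on once the declaration
replaces the standard <new> header include (hypothetical usage; the test only
needs this to compile, since it checks the emitted memset rather than running
anything):

```
namespace std {
  using size_t = decltype(sizeof(int));
}
// A declaration is enough: Sema just needs a viable array placement new for
// the value-initializing new-expression below.
void *operator new[](std::size_t, void *) noexcept;

void zero_init(void *buf) {
  new (buf) int[8](); // value-initialization is what lowers to llvm.memset
}
```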
--- clang/test/SemaCXX/PR41441.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/clang/test/SemaCXX/PR41441.cpp b/clang/test/SemaCXX/PR41441.cpp index 0b012b33fce343..d0f2917e52f211 100644 --- a/clang/test/SemaCXX/PR41441.cpp +++ b/clang/test/SemaCXX/PR41441.cpp @@ -1,6 +1,9 @@ // RUN: %clang --target=x86_64-pc-linux -S -fno-discard-value-names -emit-llvm -o - %s | FileCheck %s -#include +namespace std { + using size_t = decltype(sizeof(int)); +}; +void* operator new[](std::size_t, void*) noexcept; // CHECK: call void @llvm.memset.p0.i64(ptr align 1 %x, i8 0, i64 8, i1 false) // CHECK: call void @llvm.memset.p0.i64(ptr align 16 %x, i8 0, i64 32, i1 false) From 9a0a28f8384b2cb534953df33bf124f01f0e0d0e Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 16 Apr 2024 14:19:12 -0500 Subject: [PATCH 163/300] [Libomptarget] Rework Record & Replay to be a plugin member (#88928) Summary: Previously, the R&R support was global state initialized by a global constructor. This is bad because it prevents us from adequately constraining the lifetime of the library. Additionally, we want to minimize the amount of global state floating around. This patch moves the R&R support into a plugin member like everything else. This means there will be multiple copies of the R&R implementation floating around, but this was already the case given the fact that we currently handle everything with dynamic libraries. --- .../common/include/PluginInterface.h | 11 ++++++ .../common/src/PluginInterface.cpp | 39 ++++++++++++------- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h index 79e8464bfda5c1..7f05464f36c1f3 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h @@ -45,6 +45,8 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" +struct RecordReplayTy; + namespace llvm { namespace omp { namespace target { @@ -1031,6 +1033,12 @@ struct GenericPluginTy { return *RPCServer; } + /// Get a reference to the R&R interface for this plugin. + RecordReplayTy &getRecordAndReplay() const { + assert(RecordReplay && "R&R not initialized"); + return *RecordReplay; + } + /// Get the OpenMP requires flags set for this plugin. int64_t getRequiresFlags() const { return RequiresFlags; } @@ -1220,6 +1228,9 @@ struct GenericPluginTy { /// The interface between the plugin and the GPU for host services. RPCServerTy *RPCServer; + + /// The interface into the record-and-replay functionality. + RecordReplayTy *RecordReplay; }; namespace Plugin { diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp index b5f3c45c835fdb..6df9798f12e3d0 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp @@ -362,8 +362,6 @@ struct RecordReplayTy { } }; -static RecordReplayTy RecordReplay; - // Extract the mapping of host function pointers to device function pointers // from the entry table. 
Functions marked as 'indirect' in OpenMP will have // offloading entries generated for them which map the host's function pointer @@ -473,7 +471,8 @@ GenericKernelTy::getKernelLaunchEnvironment( // Ctor/Dtor have no arguments, replaying uses the original kernel launch // environment. Older versions of the compiler do not generate a kernel // launch environment. - if (isCtorOrDtor() || RecordReplay.isReplaying() || + if (isCtorOrDtor() || + GenericDevice.Plugin.getRecordAndReplay().isReplaying() || Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) return nullptr; @@ -562,6 +561,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, // Record the kernel description after we modified the argument count and num // blocks/threads. + RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordAndReplay(); if (RecordReplay.isRecording()) { RecordReplay.saveImage(getName(), getImage()); RecordReplay.saveKernelInput(getName(), getImage()); @@ -839,9 +839,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { delete MemoryManager; MemoryManager = nullptr; - if (RecordReplay.isRecordingOrReplaying()) - RecordReplay.deinit(); - if (RPCServer) if (auto Err = RPCServer->deinitDevice(*this)) return Err; @@ -858,6 +855,7 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { return deinitImpl(); } + Expected GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, const __tgt_device_image *InputTgtImage) { @@ -892,7 +890,8 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, return std::move(Err); // Setup the global device memory pool if needed. - if (!RecordReplay.isReplaying() && shouldSetupDeviceMemoryPool()) { + if (!Plugin.getRecordAndReplay().isReplaying() && + shouldSetupDeviceMemoryPool()) { uint64_t HeapSize; auto SizeOrErr = getDeviceHeapSize(HeapSize); if (SizeOrErr) { @@ -1307,8 +1306,8 @@ Expected GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind) { void *Alloc = nullptr; - if (RecordReplay.isRecordingOrReplaying()) - return RecordReplay.alloc(Size); + if (Plugin.getRecordAndReplay().isRecordingOrReplaying()) + return Plugin.getRecordAndReplay().alloc(Size); switch (Kind) { case TARGET_ALLOC_DEFAULT: @@ -1344,7 +1343,7 @@ Expected GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) { // Free is a noop when recording or replaying. - if (RecordReplay.isRecordingOrReplaying()) + if (Plugin.getRecordAndReplay().isRecordingOrReplaying()) return Plugin::success(); int Res; @@ -1396,6 +1395,7 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo) { + RecordReplayTy &RecordReplay = Plugin.getRecordAndReplay(); AsyncInfoWrapperTy AsyncInfoWrapper( *this, RecordReplay.isRecordingOrReplaying() ? nullptr : AsyncInfo); @@ -1495,6 +1495,9 @@ Error GenericPluginTy::init() { RPCServer = new RPCServerTy(*this); assert(RPCServer && "Invalid RPC server"); + RecordReplay = new RecordReplayTy(); + assert(RecordReplay && "Invalid Record and Replay handler"); + return Plugin::success(); } @@ -1508,6 +1511,9 @@ Error GenericPluginTy::deinit() { assert(!Devices[DeviceId] && "Device was not deinitialized"); } + if (RecordReplay && RecordReplay->isRecordingOrReplaying()) + RecordReplay->deinit(); + // There is no global handler if no device is available. 
if (GlobalHandler) delete GlobalHandler; @@ -1515,6 +1521,9 @@ Error GenericPluginTy::deinit() { if (RPCServer) delete RPCServer; + if (RecordReplay) + delete RecordReplay; + // Perform last deinitializations on the plugin. return deinitImpl(); } @@ -1630,12 +1639,12 @@ int32_t GenericPluginTy::initialize_record_replay(int32_t DeviceId, isRecord ? RecordReplayTy::RRStatusTy::RRRecording : RecordReplayTy::RRStatusTy::RRReplaying; - if (auto Err = RecordReplay.init(&Device, MemorySize, VAddr, Status, - SaveOutput, ReqPtrArgOffset)) { + if (auto Err = RecordReplay->init(&Device, MemorySize, VAddr, Status, + SaveOutput, ReqPtrArgOffset)) { REPORT("WARNING RR did not intialize RR-properly with %lu bytes" "(Error: %s)\n", MemorySize, toString(std::move(Err)).data()); - RecordReplay.setStatus(RecordReplayTy::RRStatusTy::RRDeactivated); + RecordReplay->setStatus(RecordReplayTy::RRStatusTy::RRDeactivated); if (!isRecord) { return OFFLOAD_FAIL; @@ -1984,8 +1993,8 @@ int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size, assert(DevicePtr && "Invalid device global's address"); // Save the loaded globals if we are recording. - if (RecordReplay.isRecording()) - RecordReplay.addEntry(Name, Size, *DevicePtr); + if (getRecordAndReplay().isRecording()) + getRecordAndReplay().addEntry(Name, Size, *DevicePtr); return OFFLOAD_SUCCESS; } From ed7038ef334eaccdd4104053005cab52804fbfad Mon Sep 17 00:00:00 2001 From: Jeremy Kun Date: Tue, 16 Apr 2024 12:24:19 -0700 Subject: [PATCH 164/300] specify dialect in polynomial docs (#88933) I figured out how to test this with `make mlir-doc doxygen-mlir` --- mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt index dd0384d8b79d66..79e739953d7cf4 100644 --- a/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt @@ -1,5 +1,5 @@ add_mlir_dialect(Polynomial polynomial) -add_mlir_doc(Polynomial PolynomialDialect Polynomial/ -gen-dialect-doc) +add_mlir_doc(Polynomial PolynomialDialect Polynomial/ -gen-dialect-doc -dialect=polynomial) add_mlir_doc(Polynomial PolynomialOps Polynomial/ -gen-op-doc) add_mlir_doc(Polynomial PolynomialAttributes Dialects/ -gen-attrdef-doc) add_mlir_doc(Polynomial PolynomialTypes Dialects/ -gen-typedef-doc) From 13ea36db166b7007f8b1e84e0827faaf24eb448e Mon Sep 17 00:00:00 2001 From: "juan.vazquez" Date: Tue, 16 Apr 2024 21:42:59 +0200 Subject: [PATCH 165/300] Fix UPCAddressofArraySubscriptGadget::getClaimedVarUseSites() (#88406) UPCAddressofArraySubscriptGadget::getClaimedVarUseSites should skip parentheses when accessing the DeclRefExpr, otherwise a crash happens with parenthesized references. 
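Roughly, the AST shapes involved (a simplified sketch; names are made up):

```
char *p;
void sink(char *);

void f() {
  sink(&p[1]);   // subscript base: ImplicitCastExpr -> DeclRefExpr, so
                 // IgnoreImpCasts() already reached the DeclRefExpr.
  sink(&(p)[1]); // subscript base: ImplicitCastExpr -> ParenExpr ->
                 // DeclRefExpr; IgnoreImpCasts() stops at the ParenExpr, so
                 // the cast to DeclRefExpr in the gadget asserted.
                 // IgnoreParenImpCasts() strips both layers.
}
```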
--- clang/lib/Analysis/UnsafeBufferUsage.cpp | 2 +- .../warn-unsafe-buffer-usage-suggestions-crashes.cpp | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-suggestions-crashes.cpp diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index e03fe1b6830043..c42e70d5b95ac1 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -1114,7 +1114,7 @@ class UPCAddressofArraySubscriptGadget : public FixableGadget { virtual DeclUseList getClaimedVarUseSites() const override { const auto *ArraySubst = cast(Node->getSubExpr()); const auto *DRE = - cast(ArraySubst->getBase()->IgnoreImpCasts()); + cast(ArraySubst->getBase()->IgnoreParenImpCasts()); return {DRE}; } }; diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-suggestions-crashes.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-suggestions-crashes.cpp new file mode 100644 index 00000000000000..bf4faec184ee17 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-suggestions-crashes.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -std=c++20 -Wunsafe-buffer-usage \ +// RUN: -fsafe-buffer-usage-suggestions \ +// RUN: %s -verify %s + +char * unsafe_pointer; // expected-warning{{'unsafe_pointer' is an unsafe pointer used for buffer access}} + +void test(char * param) { +} + +void dre_parenthesized() { + test(&(unsafe_pointer)[1]); // no-crash // expected-note{{used in buffer access here}} +} From f430e374462efd94d891fcf9fa09d606343c780f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 16 Apr 2024 12:47:02 -0700 Subject: [PATCH 166/300] [llvm] Drop unaligned from calls to readNext (NFC) (#88841) Now readNext defaults to unaligned accesses. This patch drops unaligned to improve readability. 
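A small self-contained sketch of the cleanup (assuming the
llvm::support::endian API, where the alignment template parameter now defaults
to unaligned):

```
#include "llvm/Support/Endian.h"
#include <cstdint>

uint64_t readLittle64(const unsigned char *&Ptr) {
  using namespace llvm::support;
  // Previously spelled endian::readNext<uint64_t, llvm::endianness::little,
  // unaligned>(Ptr); the explicit third template argument can now be dropped.
  return endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
}
```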
--- .../llvm/ProfileData/InstrProfReader.h | 4 +- llvm/include/llvm/ProfileData/MemProf.h | 25 ++++----- llvm/lib/MC/MCPseudoProbe.cpp | 2 +- llvm/lib/ProfileData/InstrProf.cpp | 4 +- llvm/lib/ProfileData/InstrProfReader.cpp | 53 +++++++------------ llvm/lib/ProfileData/MemProf.cpp | 25 +++++---- llvm/lib/ProfileData/MemProfReader.cpp | 14 ++--- llvm/lib/ProfileData/SampleProfReader.cpp | 2 +- 8 files changed, 55 insertions(+), 74 deletions(-) diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index e46570af3873f9..f662febb9216bb 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -508,9 +508,9 @@ class InstrProfLookupTrait { using namespace support; offset_type KeyLen = - endian::readNext(D); + endian::readNext(D); offset_type DataLen = - endian::readNext(D); + endian::readNext(D); return std::make_pair(KeyLen, DataLen); } diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 3520034fb1c946..d43fb1c93bb8ef 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -68,7 +68,7 @@ struct PortableMemInfoBlock { switch (Id) { #define MIBEntryDef(NameTag, Name, Type) \ case Meta::Name: { \ - Name = endian::readNext(Ptr); \ + Name = endian::readNext(Ptr); \ } break; #include "llvm/ProfileData/MIBEntryDef.inc" #undef MIBEntryDef @@ -223,13 +223,12 @@ struct Frame { using namespace support; const uint64_t F = - endian::readNext(Ptr); + endian::readNext(Ptr); const uint32_t L = - endian::readNext(Ptr); + endian::readNext(Ptr); const uint32_t C = - endian::readNext(Ptr); - const bool I = - endian::readNext(Ptr); + endian::readNext(Ptr); + const bool I = endian::readNext(Ptr); return Frame(/*Function=*/F, /*LineOffset=*/L, /*Column=*/C, /*IsInlineFrame=*/I); } @@ -482,16 +481,15 @@ class RecordLookupTrait { using namespace support; offset_type KeyLen = - endian::readNext(D); + endian::readNext(D); offset_type DataLen = - endian::readNext(D); + endian::readNext(D); return std::make_pair(KeyLen, DataLen); } uint64_t ReadKey(const unsigned char *D, offset_type /*Unused*/) { using namespace support; - return endian::readNext(D); + return endian::readNext(D); } data_type ReadData(uint64_t K, const unsigned char *D, @@ -623,16 +621,15 @@ class FrameLookupTrait { using namespace support; offset_type KeyLen = - endian::readNext(D); + endian::readNext(D); offset_type DataLen = - endian::readNext(D); + endian::readNext(D); return std::make_pair(KeyLen, DataLen); } uint64_t ReadKey(const unsigned char *D, offset_type /*Unused*/) { using namespace support; - return endian::readNext(D); + return endian::readNext(D); } data_type ReadData(uint64_t K, const unsigned char *D, diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index eb3894dbb3c254..cec50322bb9f90 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -343,7 +343,7 @@ template ErrorOr MCPseudoProbeDecoder::readUnencodedNumber() { if (Data + sizeof(T) > End) { return std::error_code(); } - T Val = endian::readNext(Data); + T Val = endian::readNext(Data); return ErrorOr(Val); } diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index a5abf63b010f7f..f9ba80bd99c857 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1135,9 +1135,9 @@ static T swapToHostOrder(const unsigned char *&D, llvm::endianness Orig) { using 
namespace support; if (Orig == llvm::endianness::little) - return endian::readNext(D); + return endian::readNext(D); else - return endian::readNext(D); + return endian::readNext(D); } static std::unique_ptr allocValueProfData(uint32_t TotalSize) { diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index a35366a106a322..8574a96a1b06fc 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -115,10 +115,9 @@ readBinaryIdsInternal(const MemoryBuffer &DataBuffer, uint64_t BILen = 0; if (Endian == llvm::endianness::little) - BILen = - endian::readNext(BI); + BILen = endian::readNext(BI); else - BILen = endian::readNext(BI); + BILen = endian::readNext(BI); if (BILen == 0) return make_error(instrprof_error::malformed, @@ -923,8 +922,7 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, // Read hash. if (D + sizeof(uint64_t) >= End) return data_type(); - uint64_t Hash = - endian::readNext(D); + uint64_t Hash = endian::readNext(D); // Initialize number of counters for GET_VERSION(FormatVersion) == 1. uint64_t CountsSize = N / sizeof(uint64_t) - 1; @@ -932,8 +930,7 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, if (GET_VERSION(FormatVersion) != IndexedInstrProf::ProfVersion::Version1) { if (D + sizeof(uint64_t) > End) return data_type(); - CountsSize = - endian::readNext(D); + CountsSize = endian::readNext(D); } // Read counter values. if (D + CountsSize * sizeof(uint64_t) > End) @@ -943,15 +940,14 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, CounterBuffer.reserve(CountsSize); for (uint64_t J = 0; J < CountsSize; ++J) CounterBuffer.push_back( - endian::readNext(D)); + endian::readNext(D)); // Read bitmap bytes for GET_VERSION(FormatVersion) > 10. if (GET_VERSION(FormatVersion) > IndexedInstrProf::ProfVersion::Version10) { uint64_t BitmapBytes = 0; if (D + sizeof(uint64_t) > End) return data_type(); - BitmapBytes = - endian::readNext(D); + BitmapBytes = endian::readNext(D); // Read bitmap byte values. if (D + BitmapBytes * sizeof(uint8_t) > End) return data_type(); @@ -959,8 +955,7 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, BitmapByteBuffer.reserve(BitmapBytes); for (uint64_t J = 0; J < BitmapBytes; ++J) BitmapByteBuffer.push_back(static_cast( - endian::readNext( - D))); + endian::readNext(D))); } DataBuffer.emplace_back(K, Hash, std::move(CounterBuffer), @@ -1256,8 +1251,7 @@ Error IndexedInstrProfReader::readHeader() { // memprof::MemProfVersion0 or the MemProf version number in // memprof::MemProfVersion1. const uint64_t FirstWord = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); memprof::IndexedVersion Version = memprof::Version0; if (FirstWord == memprof::Version1) { @@ -1282,17 +1276,15 @@ Error IndexedInstrProfReader::readHeader() { const uint64_t RecordTableOffset = Version == memprof::Version0 ? FirstWord - : support::endian::readNext(Ptr); + : support::endian::readNext( + Ptr); // The offset in the stream right before invoking // FrameTableGenerator.Emit. const uint64_t FramePayloadOffset = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); // The value returned from FrameTableGenerator.Emit. const uint64_t FrameTableOffset = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); // Read the schema. 
auto SchemaOr = memprof::readMemProfSchema(Ptr); @@ -1330,8 +1322,7 @@ Error IndexedInstrProfReader::readHeader() { const unsigned char *Ptr = Start + BinaryIdOffset; // Read binary ids size. BinaryIdsSize = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); if (BinaryIdsSize % sizeof(uint64_t)) return error(instrprof_error::bad_header); // Set the binary ids start. @@ -1348,8 +1339,7 @@ Error IndexedInstrProfReader::readHeader() { const unsigned char *Ptr = Start + VTableNamesOffset; CompressedVTableNamesLen = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); // Writer first writes the length of compressed string, and then the actual // content. @@ -1369,29 +1359,24 @@ Error IndexedInstrProfReader::readHeader() { if (Ptr + 2 * sizeof(uint64_t) > PtrEnd) return error(instrprof_error::truncated); const uint64_t NumTraces = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); TemporalProfTraceStreamSize = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); for (unsigned i = 0; i < NumTraces; i++) { // Expect at least two 64 bit fields: Weight and NumFunctions if (Ptr + 2 * sizeof(uint64_t) > PtrEnd) return error(instrprof_error::truncated); TemporalProfTraceTy Trace; Trace.Weight = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); const uint64_t NumFunctions = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); // Expect at least NumFunctions 64 bit fields if (Ptr + NumFunctions * sizeof(uint64_t) > PtrEnd) return error(instrprof_error::truncated); for (unsigned j = 0; j < NumFunctions; j++) { const uint64_t NameRef = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); Trace.FunctionNameRefs.push_back(NameRef); } TemporalProfTraces.push_back(std::move(Trace)); diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 1ca0a02d3cbde1..8e0402dd16e680 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -144,14 +144,14 @@ static IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema, // Read the meminfo nodes. const uint64_t NumNodes = - endian::readNext(Ptr); + endian::readNext(Ptr); for (uint64_t I = 0; I < NumNodes; I++) { IndexedAllocationInfo Node; const uint64_t NumFrames = - endian::readNext(Ptr); + endian::readNext(Ptr); for (uint64_t J = 0; J < NumFrames; J++) { const FrameId Id = - endian::readNext(Ptr); + endian::readNext(Ptr); Node.CallStack.push_back(Id); } Node.CSId = hashCallStack(Node.CallStack); @@ -162,15 +162,15 @@ static IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema, // Read the callsite information. const uint64_t NumCtxs = - endian::readNext(Ptr); + endian::readNext(Ptr); for (uint64_t J = 0; J < NumCtxs; J++) { const uint64_t NumFrames = - endian::readNext(Ptr); + endian::readNext(Ptr); llvm::SmallVector Frames; Frames.reserve(NumFrames); for (uint64_t K = 0; K < NumFrames; K++) { const FrameId Id = - endian::readNext(Ptr); + endian::readNext(Ptr); Frames.push_back(Id); } Record.CallSites.push_back(Frames); @@ -188,11 +188,10 @@ static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema, // Read the meminfo nodes. 
const uint64_t NumNodes = - endian::readNext(Ptr); + endian::readNext(Ptr); for (uint64_t I = 0; I < NumNodes; I++) { IndexedAllocationInfo Node; - Node.CSId = - endian::readNext(Ptr); + Node.CSId = endian::readNext(Ptr); Node.Info.deserialize(Schema, Ptr); Ptr += PortableMemInfoBlock::serializedSize(); Record.AllocSites.push_back(Node); @@ -200,10 +199,10 @@ static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema, // Read the callsite information. const uint64_t NumCtxs = - endian::readNext(Ptr); + endian::readNext(Ptr); for (uint64_t J = 0; J < NumCtxs; J++) { CallStackId CSId = - endian::readNext(Ptr); + endian::readNext(Ptr); Record.CallSiteIds.push_back(CSId); } @@ -263,7 +262,7 @@ Expected readMemProfSchema(const unsigned char *&Buffer) { const unsigned char *Ptr = Buffer; const uint64_t NumSchemaIds = - endian::readNext(Ptr); + endian::readNext(Ptr); if (NumSchemaIds > static_cast(Meta::Size)) { return make_error(instrprof_error::malformed, "memprof schema invalid"); @@ -272,7 +271,7 @@ Expected readMemProfSchema(const unsigned char *&Buffer) { MemProfSchema Result; for (size_t I = 0; I < NumSchemaIds; I++) { const uint64_t Tag = - endian::readNext(Ptr); + endian::readNext(Ptr); if (Tag >= static_cast(Meta::Size)) { return make_error(instrprof_error::malformed, "memprof schema invalid"); diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp index 91556f036c7771..b4d2c6f043f6df 100644 --- a/llvm/lib/ProfileData/MemProfReader.cpp +++ b/llvm/lib/ProfileData/MemProfReader.cpp @@ -86,7 +86,7 @@ llvm::SmallVector readSegmentEntries(const char *Ptr) { using namespace support; const uint64_t NumItemsToRead = - endian::readNext(Ptr); + endian::readNext(Ptr); llvm::SmallVector Items; for (uint64_t I = 0; I < NumItemsToRead; I++) { Items.push_back(*reinterpret_cast( @@ -100,11 +100,11 @@ readMemInfoBlocks(const char *Ptr) { using namespace support; const uint64_t NumItemsToRead = - endian::readNext(Ptr); + endian::readNext(Ptr); llvm::SmallVector> Items; for (uint64_t I = 0; I < NumItemsToRead; I++) { const uint64_t Id = - endian::readNext(Ptr); + endian::readNext(Ptr); const MemInfoBlock MIB = *reinterpret_cast(Ptr); Items.push_back({Id, MIB}); // Only increment by size of MIB since readNext implicitly increments. 
@@ -117,20 +117,20 @@ CallStackMap readStackInfo(const char *Ptr) {
   using namespace support;
 
   const uint64_t NumItemsToRead =
-      endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
+      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
   CallStackMap Items;
 
   for (uint64_t I = 0; I < NumItemsToRead; I++) {
     const uint64_t StackId =
-        endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
+        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
     const uint64_t NumPCs =
-        endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
+        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
 
     SmallVector<uint64_t> CallStack;
     CallStack.reserve(NumPCs);
     for (uint64_t J = 0; J < NumPCs; J++) {
       CallStack.push_back(
-          endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr));
+          endian::readNext<uint64_t, llvm::endianness::little>(Ptr));
     }
     Items[StackId] = CallStack;
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index 98d0aa794529c5..f91a0e6177ea02 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -503,7 +503,7 @@ ErrorOr<T> SampleProfileReaderBinary::readUnencodedNumber() {
   }
 
   using namespace support;
-  T Val = endian::readNext<T, llvm::endianness::little, unaligned>(Data);
+  T Val = endian::readNext<T, llvm::endianness::little>(Data);
   return Val;
 }

From 281d71604f418eb952e967d9dc4b26241b7f96aa Mon Sep 17 00:00:00 2001
From: Arthur Eubanks
Date: Tue, 16 Apr 2024 13:48:04 -0600
Subject: [PATCH 167/300] [X86] Change how we treat functions with explicit
 sections as small/large (#88172)

Following #78348, we should treat functions with an explicit section as
small, unless the section name is (or has the prefix) ".ltext".

Clang emits global initializers into a ".text.startup" section on Linux.
If we mix small/medium code model object files with large code model
object files, we'll end up mixing sections with and without the large
section flag.

Reland of #87838 with a check for non-ELF platforms in
TargetMachine::isLargeGlobalValue(), otherwise MCJIT on Windows tests
fail.
---
 llvm/lib/Target/TargetMachine.cpp             | 26 ++++++++++++++-----
 .../X86/code-model-elf-text-sections.ll       | 23 ++++++++++++++++
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index a7fe329b064ee1..8ddc742004292b 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -43,6 +43,12 @@ bool TargetMachine::isLargeGlobalValue(const GlobalValue *GVal) const {
   if (getTargetTriple().getArch() != Triple::x86_64)
     return false;
 
+  // Remaining logic below is ELF-specific. For other object file formats where
+  // the large code model is mostly used for JIT compilation, just look at the
+  // code model.
+  if (!getTargetTriple().isOSBinFormatELF())
+    return getCodeModel() == CodeModel::Large;
+
   auto *GO = GVal->getAliaseeObject();
 
   // Be conservative if we can't find an underlying GlobalObject.
@@ -51,9 +57,20 @@ bool TargetMachine::isLargeGlobalValue(const GlobalValue *GVal) const {
 
   auto *GV = dyn_cast<GlobalVariable>(GO);
 
+  auto IsPrefix = [](StringRef Name, StringRef Prefix) {
+    return Name.consume_front(Prefix) && (Name.empty() || Name[0] == '.');
+  };
+
   // Functions/GlobalIFuncs are only large under the large code model.
-  if (!GV)
+  if (!GV) {
+    // Handle explicit sections as we do for GlobalVariables with an explicit
+    // section, see comments below.
+    if (GO->hasSection()) {
+      StringRef Name = GO->getSection();
+      return IsPrefix(Name, ".ltext");
+    }
     return getCodeModel() == CodeModel::Large;
+  }
 
   if (GV->isThreadLocal())
     return false;
@@ -73,11 +90,8 @@ bool TargetMachine::isLargeGlobalValue(const GlobalValue *GVal) const {
   // data sections. The code model attribute overrides this above.
if (GV->hasSection()) { StringRef Name = GV->getSection(); - auto IsPrefix = [&](StringRef Prefix) { - StringRef S = Name; - return S.consume_front(Prefix) && (S.empty() || S[0] == '.'); - }; - return IsPrefix(".lbss") || IsPrefix(".ldata") || IsPrefix(".lrodata"); + return IsPrefix(Name, ".lbss") || IsPrefix(Name, ".ldata") || + IsPrefix(Name, ".lrodata"); } // Respect large data threshold for medium and large code models. diff --git a/llvm/test/CodeGen/X86/code-model-elf-text-sections.ll b/llvm/test/CodeGen/X86/code-model-elf-text-sections.ll index 016c9a4d7b8390..66a6fd37675427 100644 --- a/llvm/test/CodeGen/X86/code-model-elf-text-sections.ll +++ b/llvm/test/CodeGen/X86/code-model-elf-text-sections.ll @@ -13,9 +13,20 @@ ; RUN: llvm-readelf -S %t | FileCheck %s --check-prefix=LARGE-DS ; SMALL: .text {{.*}} AX {{.*}} +; SMALL: .ltext {{.*}} AXl {{.*}} +; SMALL: .ltext.2 {{.*}} AXl {{.*}} +; SMALL: .foo {{.*}} AX {{.*}} ; SMALL-DS: .text.func {{.*}} AX {{.*}} +; SMALL-DS: .ltext {{.*}} AXl {{.*}} +; SMALL-DS: .ltext.2 {{.*}} AXl {{.*}} +; SMALL-DS: .foo {{.*}} AX {{.*}} ; LARGE: .ltext {{.*}} AXl {{.*}} +; LARGE: .ltext.2 {{.*}} AXl {{.*}} +; LARGE: .foo {{.*}} AX {{.*}} ; LARGE-DS: .ltext.func {{.*}} AXl {{.*}} +; LARGE-DS: .ltext {{.*}} AXl {{.*}} +; LARGE-DS: .ltext.2 {{.*}} AXl {{.*}} +; LARGE-DS: .foo {{.*}} AX {{.*}} target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64--linux" @@ -23,3 +34,15 @@ target triple = "x86_64--linux" define void @func() { ret void } + +define void @ltext() section ".ltext" { + ret void +} + +define void @ltext2() section ".ltext.2" { + ret void +} + +define void @foo() section ".foo" { + ret void +} From 191be2a8a8531129c779bf23c4eec86f32c69bf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Mon, 8 Apr 2024 15:03:04 +0200 Subject: [PATCH 168/300] update_test_checks: pre-commit a new test The test shows that name preservation doesn't work properly when --include-generated-funcs is used. --- .../Inputs/stable_ir_values_funcs.ll | 23 ++++++++++++++++++ .../Inputs/stable_ir_values_funcs.ll.expected | 24 +++++++++++++++++++ .../stable_ir_values_funcs.test | 2 ++ 3 files changed, 49 insertions(+) create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll new file mode 100644 index 00000000000000..b4fd23a3d81ce2 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 +; RUN: opt < %s -S | FileCheck %s + +; The assumption underlying this test is that there are pre-existing check lines +; but something has changed, and we would like to avoid needless changes of +; meta variable names so that diffs end up being easier to read, e.g. avoid +; changing X_I33 into X_I34 or renumbering the various TMP variables. 
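+; As of this patch, regenerating the checks with --include-generated-funcs
+; still renames X_I33 to X_I34 and renumbers the TMP variables (compare the
+; .ll.expected file); a follow-up change is meant to keep names that still
+; match.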
+ +define i32 @func({i32, i32} %x, i32 %y) { + %x.i34 = extractvalue {i32, i32} %x, 0 + %1 = add i32 %y, 1 + %2 = add i32 %x.i34, %1 + %3 = mul i32 %2, 3 + ret i32 %3 +} + +; CHECK-LABEL: define i32 @func( +; CHECK-SAME: { i32, i32 } [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[X_I33:%.*]] = extractvalue { i32, i32 } [[X]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X_I33]], [[Y]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 3 +; CHECK-NEXT: ret i32 [[TMP2]] +; diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll.expected new file mode 100644 index 00000000000000..1559319ac013a2 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll.expected @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 +; RUN: opt < %s -S | FileCheck %s + +; The assumption underlying this test is that there are pre-existing check lines +; but something has changed, and we would like to avoid needless changes of +; meta variable names so that diffs end up being easier to read, e.g. avoid +; changing X_I33 into X_I34 or renumbering the various TMP variables. + +define i32 @func({i32, i32} %x, i32 %y) { + %x.i34 = extractvalue {i32, i32} %x, 0 + %1 = add i32 %y, 1 + %2 = add i32 %x.i34, %1 + %3 = mul i32 %2, 3 + ret i32 %3 +} + +; CHECK-LABEL: define i32 @func( +; CHECK-SAME: { i32, i32 } [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[X_I34:%.*]] = extractvalue { i32, i32 } [[X]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[Y]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[X_I34]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 3 +; CHECK-NEXT: ret i32 [[TMP3]] +; diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test new file mode 100644 index 00000000000000..5132fb9a26ff43 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test @@ -0,0 +1,2 @@ +# RUN: cp -f %S/Inputs/stable_ir_values_funcs.ll %t.ll && %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/stable_ir_values_funcs.ll.expected From e770249d955e06f205e91017cd394d8670996168 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Wed, 3 Apr 2024 18:14:12 +0200 Subject: [PATCH 169/300] update_test_checks: add new test This test is meant to demonstrate an upcoming change that replaces basic block labels by FileCheck patterns. 
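To illustrate the flavor of the upcoming rewrite: a literal label check such
as `; CHECK: then:` becomes a FileCheck pattern that reuses a metavariable
captured at the label's first use, e.g. `; CHECK: [[THEN]]:`. A rough sketch
of that substitution in Python (a hypothetical helper, not the script's
actual implementation):

  import re

  def filecheck_for_label(line: str, captured: set[str]) -> str:
      # Replace a literal IR block label check with a FileCheck pattern.
      m = re.match(r"^(\w+):$", line)
      if not m:
          return line
      name = m.group(1).upper()
      if name in captured:
          return f"; CHECK: [[{name}]]:"  # reuse the earlier capture
      captured.add(name)
      return f"; CHECK: [[{name}:.*]]:"  # first use defines the capture

  caps: set[str] = set()
  assert filecheck_for_label("then:", caps) == "; CHECK: [[THEN:.*]]:"
  assert filecheck_for_label("then:", caps) == "; CHECK: [[THEN]]:"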
--- .../update_test_checks/Inputs/phi-labels.ll | 39 ++++++++++ .../Inputs/phi-labels.ll.expected | 71 +++++++++++++++++++ .../update_test_checks/phi-labels.test | 5 ++ 3 files changed, 115 insertions(+) create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/phi-labels.test diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll new file mode 100644 index 00000000000000..4eb05b943f5067 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll @@ -0,0 +1,39 @@ +; RUN: opt < %s -S | FileCheck %s + +define i32 @phi_after_label(i1 %cc) { +entry: + br i1 %cc, label %then, label %end + +then: + br label %end + +end: + %r = phi i32 [ 0, %entry ], [ 1, %then ] + ret i32 %r +} + +define void @phi_before_label(i32 %bound) { +entry: + br label %loop + +loop: + %ctr = phi i32 [ 0, %entry ], [ %ctr.next, %loop ] + %ctr.next = add i32 %ctr, 1 + %cc = icmp ult i32 %ctr.next, %bound + br i1 %cc, label %loop, label %end + +end: + ret void +} + +define i32 @phi_after_label_unnamed(i1 %cc) { +0: + br i1 %cc, label %1, label %2 + +1: + br label %2 + +2: + %r = phi i32 [ 0, %0 ], [ 1, %1 ] + ret i32 %r +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll.expected new file mode 100644 index 00000000000000..1d21ebe547f689 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll.expected @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -S | FileCheck %s + +define i32 @phi_after_label(i1 %cc) { +; CHECK-LABEL: define i32 @phi_after_label( +; CHECK-SAME: i1 [[CC:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[CC]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[THEN]] ] +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %cc, label %then, label %end + +then: + br label %end + +end: + %r = phi i32 [ 0, %entry ], [ 1, %then ] + ret i32 %r +} + +define void @phi_before_label(i32 %bound) { +; CHECK-LABEL: define void @phi_before_label( +; CHECK-SAME: i32 [[BOUND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[CTR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[CTR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[CTR_NEXT]] = add i32 [[CTR]], 1 +; CHECK-NEXT: [[CC:%.*]] = icmp ult i32 [[CTR_NEXT]], [[BOUND]] +; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %ctr = phi i32 [ 0, %entry ], [ %ctr.next, %loop ] + %ctr.next = add i32 %ctr, 1 + %cc = icmp ult i32 %ctr.next, %bound + br i1 %cc, label %loop, label %end + +end: + ret void +} + +define i32 @phi_after_label_unnamed(i1 %cc) { +; CHECK-LABEL: define i32 @phi_after_label_unnamed( +; CHECK-SAME: i1 [[CC:%.*]]) { +; CHECK-NEXT: br i1 [[CC]], label [[TMP1:%.*]], label [[TMP2:%.*]] +; CHECK: 1: +; CHECK-NEXT: br label [[TMP2]] +; CHECK: 2: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ 1, [[TMP1]] ] +; CHECK-NEXT: ret i32 [[R]] +; +0: + br 
i1 %cc, label %1, label %2
+
+1:
+  br label %2
+
+2:
+  %r = phi i32 [ 0, %0 ], [ 1, %1 ]
+  ret i32 %r
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/phi-labels.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/phi-labels.test
new file mode 100644
index 00000000000000..411c84de1dcba5
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/phi-labels.test
@@ -0,0 +1,5 @@
+# RUN: cp -f %S/Inputs/phi-labels.ll %t.ll && %update_test_checks --version 4 %t.ll
+# RUN: diff -u %t.ll %S/Inputs/phi-labels.ll.expected
+## Check that running the script again does not change the result:
+# RUN: %update_test_checks %t.ll
+# RUN: diff -u %t.ll %S/Inputs/phi-labels.ll.expected

From 377a2767a9951659b5ec7309abb78da719a4f93b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Wed, 3 Apr 2024 18:19:59 +0200
Subject: [PATCH 170/300] update_test_checks: remove an unused function

---
 llvm/utils/UpdateTestChecks/common.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index ecb19d233a8d1a..eed36a0cdd73fd 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -980,10 +980,6 @@ def __init__(
     def is_local_def_ir_value(self):
         return self.ir_prefix == "%"
 
-    # Return true if this kind of IR value is "global", basically if it matches '#{{.*}}'.
-    def is_global_scope_ir_value_match(self, match):
-        return self.global_ir_rhs_regexp is not None
-
     # Return the IR prefix and check prefix we use for this kind or IR value,
     # e.g., (%, TMP) for locals. If the IR prefix is a regex, return the prefix
     # used in the IR output

From 9ec6c5d26321f5d32d97218f850ae7cafda32b2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Tue, 16 Apr 2024 13:15:56 -0700
Subject: [PATCH 171/300] [flang][cuda] Add fir.deallocate operation (#88839)

Add the fir.cuda_deallocate operation that performs device deallocation
of data held by a descriptor. This will replace the call to
AllocatableDeallocate from the runtime.

This is a companion operation to the one added in #88586
---
 .../include/flang/Optimizer/Dialect/FIROps.td | 25 +++++++++++++
 flang/lib/Optimizer/Dialect/FIROps.cpp        | 13 +++++++
 flang/test/Fir/cuf-invalid.fir                | 37 +++++++++++++++++++
 flang/test/Fir/cuf.mlir                       |  6 +++
 4 files changed, 81 insertions(+)

diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index c181c7ed62dff3..580e840587abb2 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3222,4 +3222,29 @@ def fir_CUDAAllocateOp : fir_Op<"cuda_allocate", [AttrSizedOperandSegments,
   let hasVerifier = 1;
 }
 
+def fir_CUDADeallocateOp : fir_Op<"cuda_deallocate",
+    [MemoryEffects<[MemFree]>]> {
+  let summary = "Perform the device deallocation of data of an allocatable";
+
+  let description = [{
+    The fir.cuda_deallocate operation performs the deallocation on the device
+    of the data of an allocatable.
+  }];
+
+  let arguments = (ins Arg:$box,
+                   Arg, "", [MemWrite]>:$errmsg,
+                   fir_CUDADataAttributeAttr:$cuda_attr,
+                   UnitAttr:$hasStat);
+
+  let results = (outs AnyIntegerType:$stat);
+
+  let assemblyFormat = [{
+    $box `:` qualified(type($box))
+    ( `errmsg` `(` $errmsg^ `:` type($errmsg) `)` )?
+ attr-dict `->` type($stat) + }]; + + let hasVerifier = 1; +} + #endif diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 88710880174d21..be27256d911b31 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -4012,6 +4012,19 @@ mlir::LogicalResult fir::CUDAAllocateOp::verify() { return mlir::success(); } +mlir::LogicalResult fir::CUDADeallocateOp::verify() { + if (!fir::unwrapRefType(getBox().getType()).isa()) + return emitOpError( + "expect box to be a reference to class or box type value"); + if (getErrmsg() && + !fir::unwrapRefType(getErrmsg().getType()).isa()) + return emitOpError( + "expect errmsg to be a reference to/or a box type value"); + if (getErrmsg() && !getHasStat()) + return emitOpError("expect stat attribute when errmsg is provided"); + return mlir::success(); +} + //===----------------------------------------------------------------------===// // FIROpsDialect //===----------------------------------------------------------------------===// diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir index 9c5ffe7176a3bd..5d3aa55cf346a4 100644 --- a/flang/test/Fir/cuf-invalid.fir +++ b/flang/test/Fir/cuf-invalid.fir @@ -48,3 +48,40 @@ func.func @_QPsub1() { %13 = fir.cuda_allocate %11 : !fir.ref> errmsg(%1 : !fir.ref) {cuda_attr = #fir.cuda, hasStat} -> i32 return } + +// ----- + +func.func @_QPsub1() { + %1 = fir.alloca i32 + // expected-error@+1{{'fir.cuda_deallocate' op expect box to be a reference to class or box type value}} + %2 = fir.cuda_deallocate %1 : !fir.ref {cuda_attr = #fir.cuda} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %1 = fir.alloca i32 + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + // expected-error@+1{{'fir.cuda_deallocate' op expect errmsg to be a reference to/or a box type value}} + %13 = fir.cuda_deallocate %11 : !fir.ref> errmsg(%1 : !fir.ref) {cuda_attr = #fir.cuda, hasStat} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %c100 = arith.constant 100 : index + %7 = fir.alloca !fir.char<1,100> {bindc_name = "msg", uniq_name = "_QFsub1Emsg"} + %8:2 = hlfir.declare %7 typeparams %c100 {uniq_name = "_QFsub1Emsg"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %9 = fir.embox %8#1 : (!fir.ref>) -> !fir.box> + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %16 = fir.convert %9 : (!fir.box>) -> !fir.box + // expected-error@+1{{'fir.cuda_deallocate' op expect stat attribute when errmsg is provided}} + %13 = fir.cuda_deallocate %11 : !fir.ref> errmsg(%16 : !fir.box) {cuda_attr = #fir.cuda} -> i32 + return +} diff --git a/flang/test/Fir/cuf.mlir b/flang/test/Fir/cuf.mlir index 67eff31b35b2b8..71f0652067facf 100644 --- a/flang/test/Fir/cuf.mlir +++ b/flang/test/Fir/cuf.mlir @@ -7,10 +7,12 @@ func.func @_QPsub1() { %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> %13 = fir.cuda_allocate %11 : !fir.ref> {cuda_attr = 
#fir.cuda} -> i32 + %14 = fir.cuda_deallocate %11 : !fir.ref> {cuda_attr = #fir.cuda} -> i32 return } // CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> {cuda_attr = #fir.cuda} -> i32 +// CHECK: fir.cuda_deallocate %{{.*}} : !fir.ref> {cuda_attr = #fir.cuda} -> i32 // ----- @@ -66,5 +68,9 @@ func.func @_QPsub1() { %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> %16 = fir.convert %9 : (!fir.box>) -> !fir.box %13 = fir.cuda_allocate %11 : !fir.ref> errmsg(%16 : !fir.box) {cuda_attr = #fir.cuda, hasStat} -> i32 + %14 = fir.cuda_deallocate %11 : !fir.ref> errmsg(%16 : !fir.box) {cuda_attr = #fir.cuda, hasStat} -> i32 return } + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> errmsg(%{{.*}} : !fir.box) {cuda_attr = #fir.cuda, hasStat} -> i32 +// CHECK: fir.cuda_deallocate %{{.*}} : !fir.ref> errmsg(%{{.*}} : !fir.box) {cuda_attr = #fir.cuda, hasStat} -> i32 From 34777c238b117b52dd41a9d12e8b54fb83677a12 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 16 Apr 2024 21:24:24 +0100 Subject: [PATCH 172/300] [VPlan] Don't mark VPBlendRecipe as phi-like. VPBlendRecipes don't get lowered to phis and usually do not appear at the beginning of blocks, due to their masks appearing before them. This effectively relaxes an over-eager verifier message. Fixes https://github.com/llvm/llvm-project/issues/88297. Fixes https://github.com/llvm/llvm-project/issues/88804. --- llvm/lib/Transforms/Vectorize/VPlanValue.h | 4 +- .../LoopVectorize/blend-in-header.ll | 233 ++++++++++++++++++ 2 files changed, 235 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/blend-in-header.ll diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index da3a768552fc5e..3f8d4f4fe7d647 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -361,8 +361,8 @@ class VPDef { VPWidenMemoryInstructionSC, VPWidenSC, VPWidenSelectSC, - // START: Phi-like recipes. Need to be kept together. VPBlendSC, + // START: Phi-like recipes. Need to be kept together. VPWidenPHISC, VPPredInstPHISC, // START: SubclassID for recipes that inherit VPHeaderPHIRecipe. @@ -376,7 +376,7 @@ class VPDef { VPReductionPHISC, // END: SubclassID for recipes that inherit VPHeaderPHIRecipe // END: Phi-like recipes - VPFirstPHISC = VPBlendSC, + VPFirstPHISC = VPWidenPHISC, VPFirstHeaderPHISC = VPCanonicalIVPHISC, VPLastHeaderPHISC = VPReductionPHISC, VPLastPHISC = VPReductionPHISC, diff --git a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll new file mode 100644 index 00000000000000..01e223a3243796 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2" + +; Test with blend recipe in header VPBB, from +; https://github.com/llvm/llvm-project/issues/88297. 
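+;
+; A blend recipe selects among incoming values using already-computed masks.
+; Because the masks it consumes are defined before it, it does not have to sit
+; at the very start of its block the way a phi must; the functions below pin
+; down that placement.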
+define i64 @pr88297() { +; CHECK-LABEL: define i64 @pr88297() { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 false, label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[R:%.*]] = phi i64 [ 1, [[THEN]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ 1, [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[R_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 false, label %loop.latch, label %then + +then: + br label %loop.latch + +loop.latch: + %r = phi i64 [ 1, %then ], [ 0, %loop.header ] + %iv.next = add i32 %iv, 1 + %icmp = icmp sgt i32 %iv, 1000 + br i1 %icmp, label %exit, label %loop.header + +exit: + %r.lcssa = phi i64 [ %r, %loop.latch ] + ret i64 %r.lcssa +} + +define i64 @pr88297_incoming_ops_reordered() { +; CHECK-LABEL: define i64 @pr88297_incoming_ops_reordered() { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 false, label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[R:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ 1, [[THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ 1, [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 
[[R_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 false, label %loop.latch, label %then + +then: + br label %loop.latch + +loop.latch: + %r = phi i64 [ 0, %loop.header ], [ 1, %then ] + %iv.next = add i32 %iv, 1 + %icmp = icmp sgt i32 %iv, 1000 + br i1 %icmp, label %exit, label %loop.header + +exit: + %r.lcssa = phi i64 [ %r, %loop.latch ] + ret i64 %r.lcssa +} + +define i64 @invar_cond(i1 %c) { +; CHECK-LABEL: define i64 @invar_cond( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3 +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[R:%.*]] = phi i64 [ 1, [[THEN]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[R_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c, label %loop.latch, label %then + +then: + br label %loop.latch + +loop.latch: + %r = phi i64 [ 1, %then ], [ 0, %loop.header ] + %iv.next = add i32 %iv, 1 + %icmp = icmp sgt i32 %iv, 1000 + br i1 %icmp, label %exit, label %loop.header + +exit: + %r.lcssa = phi i64 [ %r, %loop.latch ] + ret i64 %r.lcssa +} + +define i64 @invar_cond_incoming_ops_reordered(i1 %c) { +; CHECK-LABEL: define i64 @invar_cond_incoming_ops_reordered( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: 
[[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> , <4 x i64> zeroinitializer +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3 +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[R:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ 1, [[THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[R_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c, label %loop.latch, label %then + +then: + br label %loop.latch + +loop.latch: + %r = phi i64 [ 0, %loop.header ], [ 1, %then ] + %iv.next = add i32 %iv, 1 + %icmp = icmp sgt i32 %iv, 1000 + br i1 %icmp, label %exit, label %loop.header + +exit: + %r.lcssa = phi i64 [ %r, %loop.latch ] + ret i64 %r.lcssa +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +;. From b6bd41db31c798f3fc82368381fad6d42795f512 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Thu, 21 Mar 2024 11:01:21 -0500 Subject: [PATCH 173/300] [InstCombine] Add canonicalization of `sitofp` -> `uitofp nneg` This is essentially the same as #82404 but has the `nneg` flag which allows the backend to reliably undo the transform. 
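As a quick sanity check of the underlying equivalence (a standalone
illustration, not part of the patch): when the sign bit of the source is known
clear, the signed and unsigned readings denote the same integer, so `sitofp`
and `uitofp` agree and the `nneg` form is a sound canonicalization.

  # For a 32-bit value with the sign bit clear, the signed and unsigned
  # interpretations are the same integer, so either conversion to float
  # yields the same result.
  for bits in (0, 1, 1234, 2**31 - 1):
      signed = bits - 2**32 if bits & 0x80000000 else bits
      assert signed == bits  # sign bit clear: readings coincide
      assert float(signed) == float(bits)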
Closes #88299 --- clang/test/Headers/__clang_hip_math.hip | 24 +++---- .../InstCombine/InstCombineCasts.cpp | 18 ++++- .../test/Transforms/InstCombine/add-sitofp.ll | 20 +++--- .../Transforms/InstCombine/binop-itofp.ll | 66 +++++++++---------- .../Transforms/InstCombine/clamp-to-minmax.ll | 10 +-- llvm/test/Transforms/InstCombine/fpcast.ll | 24 +++---- .../Transforms/InstCombine/minmax-fold.ll | 10 +-- llvm/test/Transforms/InstCombine/minmax-fp.ll | 2 +- llvm/test/Transforms/InstCombine/pr27236.ll | 2 +- llvm/test/Transforms/InstCombine/sitofp.ll | 2 +- .../LoopVectorize/X86/float-induction-x86.ll | 6 +- .../LoopVectorize/float-induction.ll | 56 ++++++++-------- 12 files changed, 127 insertions(+), 113 deletions(-) diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 2e5f521a5feaed..1271868a53b866 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -1685,7 +1685,7 @@ extern "C" __device__ double test_j1(double x) { // DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// DEFAULT-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] // DEFAULT-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]] // DEFAULT-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] @@ -1718,7 +1718,7 @@ extern "C" __device__ double test_j1(double x) { // FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// FINITEONLY-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float [[CONV_I]], [[Y]] // FINITEONLY-NEXT: [[MUL8_I:%.*]] = fmul nnan ninf contract float [[__X1_0_I3]], [[DIV_I]] // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract float [[MUL8_I]], [[__X0_0_I2]] @@ -1751,7 +1751,7 @@ extern "C" __device__ double test_j1(double x) { // APPROX-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// APPROX-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] // APPROX-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]] // APPROX-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] @@ -1788,7 +1788,7 @@ extern "C" __device__ float test_jnf(int x, float y) { // DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 
-// DEFAULT-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] // DEFAULT-NEXT: [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]] // DEFAULT-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] @@ -1821,7 +1821,7 @@ extern "C" __device__ float test_jnf(int x, float y) { // FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// FINITEONLY-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract double [[CONV_I]], [[Y]] // FINITEONLY-NEXT: [[MUL8_I:%.*]] = fmul nnan ninf contract double [[__X1_0_I3]], [[DIV_I]] // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract double [[MUL8_I]], [[__X0_0_I2]] @@ -1854,7 +1854,7 @@ extern "C" __device__ float test_jnf(int x, float y) { // APPROX-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// APPROX-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] // APPROX-NEXT: [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]] // APPROX-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] @@ -4222,7 +4222,7 @@ extern "C" __device__ double test_y1(double x) { // DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// DEFAULT-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] // DEFAULT-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]] // DEFAULT-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] @@ -4255,7 +4255,7 @@ extern "C" __device__ double test_y1(double x) { // FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// FINITEONLY-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float [[CONV_I]], [[Y]] // FINITEONLY-NEXT: [[MUL8_I:%.*]] = fmul nnan ninf contract float [[__X1_0_I3]], [[DIV_I]] // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract float [[MUL8_I]], [[__X0_0_I2]] @@ -4288,7 +4288,7 @@ extern "C" __device__ double test_y1(double x) { // APPROX-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ 
[[CALL_I21_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// APPROX-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] // APPROX-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]] // APPROX-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] @@ -4325,7 +4325,7 @@ extern "C" __device__ float test_ynf(int x, float y) { // DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// DEFAULT-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] // DEFAULT-NEXT: [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]] // DEFAULT-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] @@ -4358,7 +4358,7 @@ extern "C" __device__ float test_ynf(int x, float y) { // FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// FINITEONLY-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract double [[CONV_I]], [[Y]] // FINITEONLY-NEXT: [[MUL8_I:%.*]] = fmul nnan ninf contract double [[__X1_0_I3]], [[DIV_I]] // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract double [[MUL8_I]], [[__X0_0_I2]] @@ -4391,7 +4391,7 @@ extern "C" __device__ float test_ynf(int x, float y) { // APPROX-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// APPROX-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] // APPROX-NEXT: [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]] // APPROX-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 437e9b92c7032f..d242d3f443def9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1977,11 +1977,25 @@ Instruction *InstCombinerImpl::visitFPToSI(FPToSIInst &FI) { } Instruction *InstCombinerImpl::visitUIToFP(CastInst &CI) { - return commonCastTransforms(CI); + if (Instruction *R = commonCastTransforms(CI)) + return R; + if (!CI.hasNonNeg() && isKnownNonNegative(CI.getOperand(0), SQ)) { + CI.setNonNeg(); + return &CI; + } + return nullptr; } Instruction *InstCombinerImpl::visitSIToFP(CastInst &CI) { - return 
commonCastTransforms(CI); + if (Instruction *R = commonCastTransforms(CI)) + return R; + if (isKnownNonNegative(CI.getOperand(0), SQ)) { + auto UI = + CastInst::Create(Instruction::UIToFP, CI.getOperand(0), CI.getType()); + UI->setNonNeg(true); + return UI; + } + return nullptr; } Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) { diff --git a/llvm/test/Transforms/InstCombine/add-sitofp.ll b/llvm/test/Transforms/InstCombine/add-sitofp.ll index 2bdc808d9771c4..f1afcaf5f85d2a 100644 --- a/llvm/test/Transforms/InstCombine/add-sitofp.ll +++ b/llvm/test/Transforms/InstCombine/add-sitofp.ll @@ -6,7 +6,7 @@ define double @x(i32 %a, i32 %b) { ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[A:%.*]], 24 ; CHECK-NEXT: [[N:%.*]] = and i32 [[M]], [[B:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[N]], 1 -; CHECK-NEXT: [[P:%.*]] = uitofp i32 [[TMP1]] to double +; CHECK-NEXT: [[P:%.*]] = uitofp nneg i32 [[TMP1]] to double ; CHECK-NEXT: ret double [[P]] ; %m = lshr i32 %a, 24 @@ -20,7 +20,7 @@ define double @test(i32 %a) { ; CHECK-LABEL: @test( ; CHECK-NEXT: [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[A_AND]], 1 -; CHECK-NEXT: [[RES:%.*]] = uitofp i32 [[TMP1]] to double +; CHECK-NEXT: [[RES:%.*]] = uitofp nneg i32 [[TMP1]] to double ; CHECK-NEXT: ret double [[RES]] ; ; Drop two highest bits to guarantee that %a + 1 doesn't overflow @@ -33,7 +33,7 @@ define double @test(i32 %a) { define float @test_neg(i32 %a) { ; CHECK-LABEL: @test_neg( ; CHECK-NEXT: [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823 -; CHECK-NEXT: [[A_AND_FP:%.*]] = sitofp i32 [[A_AND]] to float +; CHECK-NEXT: [[A_AND_FP:%.*]] = uitofp nneg i32 [[A_AND]] to float ; CHECK-NEXT: [[RES:%.*]] = fadd float [[A_AND_FP]], 1.000000e+00 ; CHECK-NEXT: ret float [[RES]] ; @@ -49,7 +49,7 @@ define double @test_2(i32 %a, i32 %b) { ; CHECK-NEXT: [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823 ; CHECK-NEXT: [[B_AND:%.*]] = and i32 [[B:%.*]], 1073741823 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[A_AND]], [[B_AND]] -; CHECK-NEXT: [[RES:%.*]] = uitofp i32 [[TMP1]] to double +; CHECK-NEXT: [[RES:%.*]] = uitofp nneg i32 [[TMP1]] to double ; CHECK-NEXT: ret double [[RES]] ; ; Drop two highest bits to guarantee that %a + %b doesn't overflow @@ -67,8 +67,8 @@ define float @test_2_neg(i32 %a, i32 %b) { ; CHECK-LABEL: @test_2_neg( ; CHECK-NEXT: [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823 ; CHECK-NEXT: [[B_AND:%.*]] = and i32 [[B:%.*]], 1073741823 -; CHECK-NEXT: [[A_AND_FP:%.*]] = sitofp i32 [[A_AND]] to float -; CHECK-NEXT: [[B_AND_FP:%.*]] = sitofp i32 [[B_AND]] to float +; CHECK-NEXT: [[A_AND_FP:%.*]] = uitofp nneg i32 [[A_AND]] to float +; CHECK-NEXT: [[B_AND_FP:%.*]] = uitofp nneg i32 [[B_AND]] to float ; CHECK-NEXT: [[RES:%.*]] = fadd float [[A_AND_FP]], [[B_AND_FP]] ; CHECK-NEXT: ret float [[RES]] ; @@ -89,7 +89,7 @@ define float @test_3(i32 %a, i32 %b) { ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[A:%.*]], 24 ; CHECK-NEXT: [[N:%.*]] = and i32 [[M]], [[B:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[N]], 1 -; CHECK-NEXT: [[P:%.*]] = uitofp i32 [[TMP1]] to float +; CHECK-NEXT: [[P:%.*]] = uitofp nneg i32 [[TMP1]] to float ; CHECK-NEXT: ret float [[P]] ; %m = lshr i32 %a, 24 @@ -104,7 +104,7 @@ define <4 x double> @test_4(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], ; CHECK-NEXT: [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i32> [[A_AND]], [[B_AND]] -; CHECK-NEXT: [[RES:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double> +; 
CHECK-NEXT: [[RES:%.*]] = uitofp nneg <4 x i32> [[TMP1]] to <4 x double> ; CHECK-NEXT: ret <4 x double> [[RES]] ; ; Drop two highest bits to guarantee that %a + %b doesn't overflow @@ -122,8 +122,8 @@ define <4 x float> @test_4_neg(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @test_4_neg( ; CHECK-NEXT: [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], ; CHECK-NEXT: [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], -; CHECK-NEXT: [[A_AND_FP:%.*]] = sitofp <4 x i32> [[A_AND]] to <4 x float> -; CHECK-NEXT: [[B_AND_FP:%.*]] = sitofp <4 x i32> [[B_AND]] to <4 x float> +; CHECK-NEXT: [[A_AND_FP:%.*]] = uitofp nneg <4 x i32> [[A_AND]] to <4 x float> +; CHECK-NEXT: [[B_AND_FP:%.*]] = uitofp nneg <4 x i32> [[B_AND]] to <4 x float> ; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[A_AND_FP]], [[B_AND_FP]] ; CHECK-NEXT: ret <4 x float> [[RES]] ; diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll index d72a54e8babc9f..097a8196af80f8 100644 --- a/llvm/test/Transforms/InstCombine/binop-itofp.ll +++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll @@ -21,7 +21,7 @@ define half @test_ui_ui_i8_add_fail_overflow(i8 noundef %x_in, i8 noundef %y_in) ; CHECK-LABEL: @test_ui_ui_i8_add_fail_overflow( ; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 127 ; CHECK-NEXT: [[Y:%.*]] = and i8 [[Y_IN:%.*]], -127 -; CHECK-NEXT: [[XF:%.*]] = uitofp i8 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half ; CHECK-NEXT: [[YF:%.*]] = uitofp i8 [[Y]] to half ; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] @@ -49,7 +49,7 @@ define half @test_ui_ui_i8_add_C(i8 noundef %x_in) { define half @test_ui_ui_i8_add_C_fail_no_repr(i8 noundef %x_in) { ; CHECK-LABEL: @test_ui_ui_i8_add_C_fail_no_repr( ; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 127 -; CHECK-NEXT: [[XF:%.*]] = uitofp i8 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half ; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], 0xH57F8 ; CHECK-NEXT: ret half [[R]] ; @@ -62,7 +62,7 @@ define half @test_ui_ui_i8_add_C_fail_no_repr(i8 noundef %x_in) { define half @test_ui_ui_i8_add_C_fail_overflow(i8 noundef %x_in) { ; CHECK-LABEL: @test_ui_ui_i8_add_C_fail_overflow( ; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 127 -; CHECK-NEXT: [[XF:%.*]] = uitofp i8 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half ; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], 0xH5808 ; CHECK-NEXT: ret half [[R]] ; @@ -110,7 +110,7 @@ define half @test_ui_si_i8_add(i8 noundef %x_in, i8 noundef %y_in) { ; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 63 ; CHECK-NEXT: [[Y:%.*]] = and i8 [[Y_IN:%.*]], 63 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i8 [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = uitofp i8 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %x = and i8 %x_in, 63 @@ -140,7 +140,7 @@ define half @test_ui_si_i8_add_overflow(i8 noundef %x_in, i8 noundef %y_in) { define half @test_ui_ui_i8_sub_C(i8 noundef %x_in) { ; CHECK-LABEL: @test_ui_ui_i8_sub_C( ; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X_IN:%.*]], 127 -; CHECK-NEXT: [[R:%.*]] = uitofp i8 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %x = or i8 %x_in, 128 @@ -166,7 +166,7 @@ define half @test_si_si_i8_sub(i8 noundef %x_in, i8 noundef %y_in) { ; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 63 ; CHECK-NEXT: [[Y:%.*]] = or i8 [[Y_IN:%.*]], -64 ; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i8 [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = sitofp i8 
[[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %x = and i8 %x_in, 63 @@ -181,7 +181,7 @@ define half @test_si_si_i8_sub_fail_overflow(i8 noundef %x_in, i8 noundef %y_in) ; CHECK-LABEL: @test_si_si_i8_sub_fail_overflow( ; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 63 ; CHECK-NEXT: [[Y:%.*]] = or i8 [[Y_IN:%.*]], -65 -; CHECK-NEXT: [[XF:%.*]] = sitofp i8 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half ; CHECK-NEXT: [[YF:%.*]] = sitofp i8 [[Y]] to half ; CHECK-NEXT: [[R:%.*]] = fsub half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] @@ -198,7 +198,7 @@ define half @test_si_si_i8_sub_C(i8 noundef %x_in) { ; CHECK-LABEL: @test_si_si_i8_sub_C( ; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 63 ; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[X]], 64 -; CHECK-NEXT: [[R:%.*]] = sitofp i8 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %x = and i8 %x_in, 63 @@ -283,7 +283,7 @@ define half @test_ui_ui_i8_mul_C(i8 noundef %x_in) { define half @test_ui_ui_i8_mul_C_fail_overlow(i8 noundef %x_in) { ; CHECK-LABEL: @test_ui_ui_i8_mul_C_fail_overlow( ; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 14 -; CHECK-NEXT: [[XF:%.*]] = uitofp i8 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half ; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], 0xH4CC0 ; CHECK-NEXT: ret half [[R]] ; @@ -315,7 +315,7 @@ define half @test_si_si_i8_mul_fail_maybe_zero(i8 noundef %x_in, i8 noundef %y_i ; CHECK-LABEL: @test_si_si_i8_mul_fail_maybe_zero( ; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 7 ; CHECK-NEXT: [[Y:%.*]] = or i8 [[Y_IN:%.*]], -8 -; CHECK-NEXT: [[XF:%.*]] = sitofp i8 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half ; CHECK-NEXT: [[YF:%.*]] = sitofp i8 [[Y]] to half ; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] @@ -332,7 +332,7 @@ define half @test_si_si_i8_mul_C_fail_no_repr(i8 noundef %x_in) { ; CHECK-LABEL: @test_si_si_i8_mul_C_fail_no_repr( ; CHECK-NEXT: [[XX:%.*]] = and i8 [[X_IN:%.*]], 6 ; CHECK-NEXT: [[X:%.*]] = or disjoint i8 [[XX]], 1 -; CHECK-NEXT: [[XF:%.*]] = sitofp i8 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half ; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], 0xHC780 ; CHECK-NEXT: ret half [[R]] ; @@ -347,7 +347,7 @@ define half @test_si_si_i8_mul_C_fail_overflow(i8 noundef %x_in) { ; CHECK-LABEL: @test_si_si_i8_mul_C_fail_overflow( ; CHECK-NEXT: [[XX:%.*]] = and i8 [[X_IN:%.*]], 6 ; CHECK-NEXT: [[X:%.*]] = or disjoint i8 [[XX]], 1 -; CHECK-NEXT: [[XF:%.*]] = sitofp i8 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half ; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], 0xHCCC0 ; CHECK-NEXT: ret half [[R]] ; @@ -365,7 +365,7 @@ define half @test_ui_si_i8_mul(i8 noundef %x_in, i8 noundef %y_in) { ; CHECK-NEXT: [[YY:%.*]] = and i8 [[Y_IN:%.*]], 7 ; CHECK-NEXT: [[Y:%.*]] = add nuw nsw i8 [[YY]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i8 [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = uitofp i8 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %xx = and i8 %x_in, 6 @@ -384,7 +384,7 @@ define half @test_ui_si_i8_mul_fail_maybe_zero(i8 noundef %x_in, i8 noundef %y_i ; CHECK-NEXT: [[X:%.*]] = add nuw nsw i8 [[XX]], 1 ; CHECK-NEXT: [[Y:%.*]] = and i8 [[Y_IN:%.*]], 7 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i8 [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = uitofp i8 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 
[[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %xx = and i8 %x_in, 7 @@ -401,7 +401,7 @@ define half @test_ui_si_i8_mul_fail_signed(i8 noundef %x_in, i8 noundef %y_in) { ; CHECK-NEXT: [[XX:%.*]] = and i8 [[X_IN:%.*]], 7 ; CHECK-NEXT: [[X:%.*]] = add nuw nsw i8 [[XX]], 1 ; CHECK-NEXT: [[Y:%.*]] = or i8 [[Y_IN:%.*]], -4 -; CHECK-NEXT: [[XF:%.*]] = sitofp i8 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half ; CHECK-NEXT: [[YF:%.*]] = uitofp i8 [[Y]] to half ; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] @@ -420,7 +420,7 @@ define half @test_ui_ui_i16_add(i16 noundef %x_in, i16 noundef %y_in) { ; CHECK-NEXT: [[X:%.*]] = and i16 [[X_IN:%.*]], 2047 ; CHECK-NEXT: [[Y:%.*]] = and i16 [[Y_IN:%.*]], 2047 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i16 [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = uitofp i16 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i16 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %x = and i16 %x_in, 2047 @@ -435,8 +435,8 @@ define half @test_ui_ui_i16_add_fail_not_promotable(i16 noundef %x_in, i16 nound ; CHECK-LABEL: @test_ui_ui_i16_add_fail_not_promotable( ; CHECK-NEXT: [[X:%.*]] = and i16 [[X_IN:%.*]], 2049 ; CHECK-NEXT: [[Y:%.*]] = and i16 [[Y_IN:%.*]], 2047 -; CHECK-NEXT: [[XF:%.*]] = uitofp i16 [[X]] to half -; CHECK-NEXT: [[YF:%.*]] = uitofp i16 [[Y]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i16 [[X]] to half +; CHECK-NEXT: [[YF:%.*]] = uitofp nneg i16 [[Y]] to half ; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] ; @@ -463,7 +463,7 @@ define half @test_ui_ui_i16_add_C(i16 noundef %x_in) { define half @test_ui_ui_i16_add_C_fail_overflow(i16 noundef %x_in) { ; CHECK-LABEL: @test_ui_ui_i16_add_C_fail_overflow( ; CHECK-NEXT: [[X:%.*]] = and i16 [[X_IN:%.*]], 2047 -; CHECK-NEXT: [[XF:%.*]] = uitofp i16 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i16 [[X]] to half ; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], 0xH7BD0 ; CHECK-NEXT: ret half [[R]] ; @@ -541,7 +541,7 @@ define half @test_si_si_i16_sub_fail_no_promotion(i16 noundef %x_in, i16 noundef ; CHECK-LABEL: @test_si_si_i16_sub_fail_no_promotion( ; CHECK-NEXT: [[X:%.*]] = and i16 [[X_IN:%.*]], 2047 ; CHECK-NEXT: [[Y:%.*]] = or i16 [[Y_IN:%.*]], -2049 -; CHECK-NEXT: [[XF:%.*]] = sitofp i16 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i16 [[X]] to half ; CHECK-NEXT: [[YF:%.*]] = sitofp i16 [[Y]] to half ; CHECK-NEXT: [[R:%.*]] = fsub half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] @@ -575,7 +575,7 @@ define half @test_ui_si_i16_sub_fail_maybe_signed(i16 noundef %x_in, i16 noundef ; CHECK-NEXT: [[X:%.*]] = or i16 [[X_IN:%.*]], -2048 ; CHECK-NEXT: [[Y:%.*]] = and i16 [[Y_IN:%.*]], 2047 ; CHECK-NEXT: [[XF:%.*]] = uitofp i16 [[X]] to half -; CHECK-NEXT: [[YF:%.*]] = sitofp i16 [[Y]] to half +; CHECK-NEXT: [[YF:%.*]] = uitofp nneg i16 [[Y]] to half ; CHECK-NEXT: [[R:%.*]] = fsub half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] ; @@ -607,8 +607,8 @@ define half @test_ui_ui_i16_mul_fail_no_promotion(i16 noundef %x_in, i16 noundef ; CHECK-LABEL: @test_ui_ui_i16_mul_fail_no_promotion( ; CHECK-NEXT: [[X:%.*]] = and i16 [[X_IN:%.*]], 4095 ; CHECK-NEXT: [[Y:%.*]] = and i16 [[Y_IN:%.*]], 3 -; CHECK-NEXT: [[XF:%.*]] = uitofp i16 [[X]] to half -; CHECK-NEXT: [[YF:%.*]] = uitofp i16 [[Y]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i16 [[X]] to half +; CHECK-NEXT: [[YF:%.*]] = uitofp nneg i16 [[Y]] to half ; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] ; @@ -643,7 +643,7 @@ define half 
@test_si_si_i16_mul_fail_overflow(i16 noundef %x_in, i16 noundef %y_ ; CHECK-NEXT: [[XX:%.*]] = and i16 [[X_IN:%.*]], 126 ; CHECK-NEXT: [[X:%.*]] = or disjoint i16 [[XX]], 1 ; CHECK-NEXT: [[Y:%.*]] = or i16 [[Y_IN:%.*]], -257 -; CHECK-NEXT: [[XF:%.*]] = sitofp i16 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i16 [[X]] to half ; CHECK-NEXT: [[YF:%.*]] = sitofp i16 [[Y]] to half ; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] @@ -690,7 +690,7 @@ define half @test_ui_si_i16_mul(i16 noundef %x_in, i16 noundef %y_in) { ; CHECK-NEXT: [[YY:%.*]] = and i16 [[Y_IN:%.*]], 126 ; CHECK-NEXT: [[Y:%.*]] = or disjoint i16 [[YY]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i16 [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = uitofp i16 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i16 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %xx = and i16 %x_in, 126 @@ -723,7 +723,7 @@ define half @test_ui_ui_i12_add_fail_overflow(i12 noundef %x_in, i12 noundef %y_ ; CHECK-LABEL: @test_ui_ui_i12_add_fail_overflow( ; CHECK-NEXT: [[X:%.*]] = and i12 [[X_IN:%.*]], 2047 ; CHECK-NEXT: [[Y:%.*]] = and i12 [[Y_IN:%.*]], -2047 -; CHECK-NEXT: [[XF:%.*]] = uitofp i12 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i12 [[X]] to half ; CHECK-NEXT: [[YF:%.*]] = uitofp i12 [[Y]] to half ; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] @@ -821,7 +821,7 @@ define half @test_si_si_i12_sub(i12 noundef %x_in, i12 noundef %y_in) { ; CHECK-NEXT: [[X:%.*]] = and i12 [[X_IN:%.*]], 1023 ; CHECK-NEXT: [[Y:%.*]] = or i12 [[Y_IN:%.*]], -1024 ; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i12 [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = sitofp i12 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i12 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %x = and i12 %x_in, 1023 @@ -850,7 +850,7 @@ define half @test_ui_ui_i12_mul(i12 noundef %x_in, i12 noundef %y_in) { ; CHECK-NEXT: [[X:%.*]] = and i12 [[X_IN:%.*]], 31 ; CHECK-NEXT: [[Y:%.*]] = and i12 [[Y_IN:%.*]], 63 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i12 [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = uitofp i12 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i12 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %x = and i12 %x_in, 31 @@ -883,7 +883,7 @@ define half @test_ui_ui_i12_mul_C(i12 noundef %x_in) { ; CHECK-LABEL: @test_ui_ui_i12_mul_C( ; CHECK-NEXT: [[X:%.*]] = shl i12 [[X_IN:%.*]], 6 ; CHECK-NEXT: [[TMP1:%.*]] = and i12 [[X]], 1984 -; CHECK-NEXT: [[R:%.*]] = uitofp i12 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i12 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %x = and i12 %x_in, 31 @@ -915,7 +915,7 @@ define half @test_si_si_i12_mul_fail_overflow(i12 noundef %x_in, i12 noundef %y_ ; CHECK-NEXT: [[XX:%.*]] = and i12 [[X_IN:%.*]], 30 ; CHECK-NEXT: [[X:%.*]] = or disjoint i12 [[XX]], 1 ; CHECK-NEXT: [[Y:%.*]] = or i12 [[Y_IN:%.*]], -128 -; CHECK-NEXT: [[XF:%.*]] = sitofp i12 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i12 [[X]] to half ; CHECK-NEXT: [[YF:%.*]] = sitofp i12 [[Y]] to half ; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] @@ -933,7 +933,7 @@ define half @test_si_si_i12_mul_fail_maybe_non_zero(i12 noundef %x_in, i12 nound ; CHECK-LABEL: @test_si_si_i12_mul_fail_maybe_non_zero( ; CHECK-NEXT: [[X:%.*]] = and i12 [[X_IN:%.*]], 30 ; CHECK-NEXT: [[Y:%.*]] = or i12 [[Y_IN:%.*]], -128 -; CHECK-NEXT: [[XF:%.*]] = sitofp i12 [[X]] to half +; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i12 [[X]] to half ; CHECK-NEXT: [[YF:%.*]] = sitofp i12 [[Y]] to half ; 
CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]] ; CHECK-NEXT: ret half [[R]] @@ -950,7 +950,7 @@ define half @test_si_si_i12_mul_C(i12 noundef %x_in) { ; CHECK-LABEL: @test_si_si_i12_mul_C( ; CHECK-NEXT: [[X:%.*]] = or i12 [[X_IN:%.*]], -64 ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i12 [[X]], -16 -; CHECK-NEXT: [[R:%.*]] = sitofp i12 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i12 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %x = or i12 %x_in, -64 @@ -979,7 +979,7 @@ define half @test_ui_si_i12_mul_nsw(i12 noundef %x_in, i12 noundef %y_in) { ; CHECK-NEXT: [[YY:%.*]] = and i12 [[Y_IN:%.*]], 30 ; CHECK-NEXT: [[Y:%.*]] = or disjoint i12 [[YY]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i12 [[X]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = uitofp i12 [[TMP1]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i12 [[TMP1]] to half ; CHECK-NEXT: ret half [[R]] ; %xx = and i12 %x_in, 31 diff --git a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll index 9da9eb36d381f0..1dd0b17e9f46dd 100644 --- a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll +++ b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll @@ -472,7 +472,7 @@ define float @ui32_clamp_and_cast_to_float(i32 %x) { ; CHECK-LABEL: @ui32_clamp_and_cast_to_float( ; CHECK-NEXT: [[LO_CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[MIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 255) -; CHECK-NEXT: [[MIN:%.*]] = uitofp i32 [[MIN1]] to float +; CHECK-NEXT: [[MIN:%.*]] = uitofp nneg i32 [[MIN1]] to float ; CHECK-NEXT: [[R:%.*]] = select i1 [[LO_CMP]], float 1.000000e+00, float [[MIN]] ; CHECK-NEXT: ret float [[R]] ; @@ -488,7 +488,7 @@ define float @ui64_clamp_and_cast_to_float(i64 %x) { ; CHECK-LABEL: @ui64_clamp_and_cast_to_float( ; CHECK-NEXT: [[LO_CMP:%.*]] = icmp eq i64 [[X:%.*]], 0 ; CHECK-NEXT: [[MIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[X]], i64 255) -; CHECK-NEXT: [[MIN:%.*]] = uitofp i64 [[MIN1]] to float +; CHECK-NEXT: [[MIN:%.*]] = uitofp nneg i64 [[MIN1]] to float ; CHECK-NEXT: [[R:%.*]] = select i1 [[LO_CMP]], float 1.000000e+00, float [[MIN]] ; CHECK-NEXT: ret float [[R]] ; @@ -504,7 +504,7 @@ define float @mixed_clamp_to_float_1(i32 %x) { ; CHECK-LABEL: @mixed_clamp_to_float_1( ; CHECK-NEXT: [[SI_MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 255) ; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[SI_MIN]], i32 1) -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[R1]] to float +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[R1]] to float ; CHECK-NEXT: ret float [[R]] ; %si_min_cmp = icmp sgt i32 %x, 255 @@ -539,7 +539,7 @@ define float @mixed_clamp_to_float_2(i32 %x) { ; CHECK-LABEL: @mixed_clamp_to_float_2( ; CHECK-NEXT: [[SI_MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 255) ; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[SI_MIN]], i32 1) -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[R1]] to float +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[R1]] to float ; CHECK-NEXT: ret float [[R]] ; %si_min_cmp = icmp sgt i32 %x, 255 @@ -572,7 +572,7 @@ define <2 x float> @mixed_clamp_to_float_vec(<2 x i32> %x) { ; CHECK-LABEL: @mixed_clamp_to_float_vec( ; CHECK-NEXT: [[SI_MIN:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[X:%.*]], <2 x i32> ) ; CHECK-NEXT: [[R1:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[SI_MIN]], <2 x i32> ) -; CHECK-NEXT: [[R:%.*]] = sitofp <2 x i32> [[R1]] to <2 x float> +; CHECK-NEXT: [[R:%.*]] = uitofp nneg <2 x i32> [[R1]] to <2 x float> ; CHECK-NEXT: ret <2 x float> [[R]] ; %si_min_cmp = icmp sgt <2 x i32> %x, diff --git 
a/llvm/test/Transforms/InstCombine/fpcast.ll b/llvm/test/Transforms/InstCombine/fpcast.ll index ac4b88fcddd7ec..d2c932ba447e4e 100644 --- a/llvm/test/Transforms/InstCombine/fpcast.ll +++ b/llvm/test/Transforms/InstCombine/fpcast.ll @@ -170,7 +170,7 @@ define half @sint_to_fptrunc(i32 %x) { define half @masked_sint_to_fptrunc1(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fptrunc1( ; CHECK-NEXT: [[M:%.*]] = and i32 [[X:%.*]], 16777215 -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[M]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to half ; CHECK-NEXT: ret half [[R]] ; %m = and i32 %x, 16777215 @@ -182,7 +182,7 @@ define half @masked_sint_to_fptrunc1(i32 %x) { define half @masked_sint_to_fptrunc2(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fptrunc2( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 8 -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[M]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to half ; CHECK-NEXT: ret half [[R]] ; %m = lshr i32 %x, 8 @@ -194,7 +194,7 @@ define half @masked_sint_to_fptrunc2(i32 %x) { define half @masked_sint_to_fptrunc3(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fptrunc3( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 7 -; CHECK-NEXT: [[F:%.*]] = sitofp i32 [[M]] to float +; CHECK-NEXT: [[F:%.*]] = uitofp nneg i32 [[M]] to float ; CHECK-NEXT: [[R:%.*]] = fptrunc float [[F]] to half ; CHECK-NEXT: ret half [[R]] ; @@ -218,7 +218,7 @@ define double @sint_to_fpext(i32 %x) { define double @masked_sint_to_fpext1(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fpext1( ; CHECK-NEXT: [[M:%.*]] = and i32 [[X:%.*]], 16777215 -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[M]] to double +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to double ; CHECK-NEXT: ret double [[R]] ; %m = and i32 %x, 16777215 @@ -230,7 +230,7 @@ define double @masked_sint_to_fpext1(i32 %x) { define double @masked_sint_to_fpext2(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fpext2( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 8 -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[M]] to double +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to double ; CHECK-NEXT: ret double [[R]] ; %m = lshr i32 %x, 8 @@ -242,7 +242,7 @@ define double @masked_sint_to_fpext2(i32 %x) { define double @masked_sint_to_fpext3(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fpext3( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 7 -; CHECK-NEXT: [[F:%.*]] = sitofp i32 [[M]] to float +; CHECK-NEXT: [[F:%.*]] = uitofp nneg i32 [[M]] to float ; CHECK-NEXT: [[R:%.*]] = fpext float [[F]] to double ; CHECK-NEXT: ret double [[R]] ; @@ -266,7 +266,7 @@ define half @uint_to_fptrunc(i32 %x) { define half @masked_uint_to_fptrunc1(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fptrunc1( ; CHECK-NEXT: [[M:%.*]] = and i32 [[X:%.*]], 16777215 -; CHECK-NEXT: [[R:%.*]] = uitofp i32 [[M]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to half ; CHECK-NEXT: ret half [[R]] ; %m = and i32 %x, 16777215 @@ -278,7 +278,7 @@ define half @masked_uint_to_fptrunc1(i32 %x) { define half @masked_uint_to_fptrunc2(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fptrunc2( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 8 -; CHECK-NEXT: [[R:%.*]] = uitofp i32 [[M]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to half ; CHECK-NEXT: ret half [[R]] ; %m = lshr i32 %x, 8 @@ -290,7 +290,7 @@ define half @masked_uint_to_fptrunc2(i32 %x) { define half @masked_uint_to_fptrunc3(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fptrunc3( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 7 -; CHECK-NEXT: [[F:%.*]] = uitofp i32 [[M]] to float +; CHECK-NEXT: [[F:%.*]] = uitofp nneg i32 [[M]] to float ; 
CHECK-NEXT: [[R:%.*]] = fptrunc float [[F]] to half ; CHECK-NEXT: ret half [[R]] ; @@ -314,7 +314,7 @@ define double @uint_to_fpext(i32 %x) { define double @masked_uint_to_fpext1(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fpext1( ; CHECK-NEXT: [[M:%.*]] = and i32 [[X:%.*]], 16777215 -; CHECK-NEXT: [[R:%.*]] = uitofp i32 [[M]] to double +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to double ; CHECK-NEXT: ret double [[R]] ; %m = and i32 %x, 16777215 @@ -326,7 +326,7 @@ define double @masked_uint_to_fpext1(i32 %x) { define double @masked_uint_to_fpext2(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fpext2( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 8 -; CHECK-NEXT: [[R:%.*]] = uitofp i32 [[M]] to double +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to double ; CHECK-NEXT: ret double [[R]] ; %m = lshr i32 %x, 8 @@ -338,7 +338,7 @@ define double @masked_uint_to_fpext2(i32 %x) { define double @masked_uint_to_fpext3(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fpext3( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 7 -; CHECK-NEXT: [[F:%.*]] = uitofp i32 [[M]] to float +; CHECK-NEXT: [[F:%.*]] = uitofp nneg i32 [[M]] to float ; CHECK-NEXT: [[R:%.*]] = fpext float [[F]] to double ; CHECK-NEXT: ret double [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/minmax-fold.ll b/llvm/test/Transforms/InstCombine/minmax-fold.ll index 8391fe33eb9b59..bbbbf9eb6eafe4 100644 --- a/llvm/test/Transforms/InstCombine/minmax-fold.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fold.ll @@ -131,7 +131,7 @@ define i64 @t9(i32 %a) { define float @t10(i32 %x) { ; CHECK-LABEL: @t10( ; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 255) -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[R1]] to float +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[R1]] to float ; CHECK-NEXT: ret float [[R]] ; %f_x = sitofp i32 %x to float @@ -143,7 +143,7 @@ define float @t10(i32 %x) { define float @t11(i64 %x) { ; CHECK-LABEL: @t11( ; CHECK-NEXT: [[R1:%.*]] = call i64 @llvm.smax.i64(i64 [[X:%.*]], i64 255) -; CHECK-NEXT: [[R:%.*]] = sitofp i64 [[R1]] to float +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i64 [[R1]] to float ; CHECK-NEXT: ret float [[R]] ; %f_x = sitofp i64 %x to float @@ -526,7 +526,7 @@ falselabel: define double @PR31751_umin1(i32 %x) { ; CHECK-LABEL: @PR31751_umin1( ; CHECK-NEXT: [[SEL:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 2147483647) -; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[SEL]] to double +; CHECK-NEXT: [[CONV:%.*]] = uitofp nneg i32 [[SEL]] to double ; CHECK-NEXT: ret double [[CONV]] ; %cmp = icmp slt i32 %x, 0 @@ -538,7 +538,7 @@ define double @PR31751_umin1(i32 %x) { define double @PR31751_umin2(i32 %x) { ; CHECK-LABEL: @PR31751_umin2( ; CHECK-NEXT: [[SEL:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 2147483647) -; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[SEL]] to double +; CHECK-NEXT: [[CONV:%.*]] = uitofp nneg i32 [[SEL]] to double ; CHECK-NEXT: ret double [[CONV]] ; %cmp = icmp ult i32 %x, 2147483647 @@ -550,7 +550,7 @@ define double @PR31751_umin2(i32 %x) { define double @PR31751_umin3(i32 %x) { ; CHECK-LABEL: @PR31751_umin3( ; CHECK-NEXT: [[SEL:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 2147483647) -; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[SEL]] to double +; CHECK-NEXT: [[CONV:%.*]] = uitofp nneg i32 [[SEL]] to double ; CHECK-NEXT: ret double [[CONV]] ; %cmp = icmp ugt i32 %x, 2147483647 diff --git a/llvm/test/Transforms/InstCombine/minmax-fp.ll b/llvm/test/Transforms/InstCombine/minmax-fp.ll index f89e8a18e63440..b9e46caa63753a 100644 --- 
a/llvm/test/Transforms/InstCombine/minmax-fp.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fp.ll @@ -257,7 +257,7 @@ define double @t16(i32 %x) { define double @t17(i32 %x) { ; CHECK-LABEL: @t17( ; CHECK-NEXT: [[SEL1:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 2) -; CHECK-NEXT: [[SEL:%.*]] = sitofp i32 [[SEL1]] to double +; CHECK-NEXT: [[SEL:%.*]] = uitofp nneg i32 [[SEL1]] to double ; CHECK-NEXT: ret double [[SEL]] ; %cmp = icmp sgt i32 %x, 2 diff --git a/llvm/test/Transforms/InstCombine/pr27236.ll b/llvm/test/Transforms/InstCombine/pr27236.ll index 61ea344b1bdbd4..67c320d3524664 100644 --- a/llvm/test/Transforms/InstCombine/pr27236.ll +++ b/llvm/test/Transforms/InstCombine/pr27236.ll @@ -4,7 +4,7 @@ define float @test1(i32 %scale) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[SCALE:%.*]], i32 1) -; CHECK-NEXT: [[TMP2:%.*]] = sitofp i32 [[TMP1]] to float +; CHECK-NEXT: [[TMP2:%.*]] = uitofp nneg i32 [[TMP1]] to float ; CHECK-NEXT: ret float [[TMP2]] ; %1 = icmp sgt i32 1, %scale diff --git a/llvm/test/Transforms/InstCombine/sitofp.ll b/llvm/test/Transforms/InstCombine/sitofp.ll index cc6b6425eb03c8..51eff39cd900e2 100644 --- a/llvm/test/Transforms/InstCombine/sitofp.ll +++ b/llvm/test/Transforms/InstCombine/sitofp.ll @@ -256,7 +256,7 @@ define i25 @consider_lowbits_masked_input(i25 %A) { define i32 @overflow_masked_input(i32 %A) { ; CHECK-LABEL: @overflow_masked_input( ; CHECK-NEXT: [[M:%.*]] = and i32 [[A:%.*]], 16777217 -; CHECK-NEXT: [[B:%.*]] = uitofp i32 [[M]] to float +; CHECK-NEXT: [[B:%.*]] = uitofp nneg i32 [[M]] to float ; CHECK-NEXT: [[C:%.*]] = fptoui float [[B]] to i32 ; CHECK-NEXT: ret i32 [[C]] ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index c55e732c901475..59b8ce42380d9d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -25,7 +25,7 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 { ; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[ZEXT]], 2147483616 -; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; AUTO_VEC-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP0]], 1.000000e+00 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] @@ -201,7 +201,7 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 -; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to double +; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to double ; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast double [[DOTCAST]], 3.000000e+00 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; AUTO_VEC: vector.body: @@ -366,7 +366,7 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) { ; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967264 -; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; 
AUTO_VEC-NEXT: [[TMP1:%.*]] = fmul reassoc float [[DOTCAST]], 4.200000e+01 ; AUTO_VEC-NEXT: [[IND_END:%.*]] = fadd reassoc float [[TMP1]], 1.000000e+00 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index caea114e3d4487..bd658c31768a84 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -29,7 +29,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -84,7 +84,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 -; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -142,7 +142,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -193,7 +193,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 @@ -276,7 +276,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 
[[TMP0]], 2147483644 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -331,7 +331,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 -; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -389,7 +389,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -442,7 +442,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 @@ -526,7 +526,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -574,7 +574,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 -; VEC4_INTERL2-NEXT: 
[[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -625,7 +625,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -675,7 +675,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 @@ -758,10 +758,10 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483644 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 ; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 -; VEC4_INTERL1-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC4_INTERL1-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -835,10 +835,10 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483640 -; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 -; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; 
VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC4_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -922,10 +922,10 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483646 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 ; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 -; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC1_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1000,10 +1000,10 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483646 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 @@ -1113,7 +1113,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1158,7 +1158,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 -; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; 
VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1206,7 +1206,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1256,7 +1256,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1319,7 +1319,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] @@ -1396,7 +1396,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775800 -; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE17:%.*]] ] @@ -1512,7 +1512,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] @@ -1570,7 +1570,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 
9223372036854775806 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] From 885b8d9bb5192267cb2449a9ddec28e20ac9300e Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 16 Apr 2024 13:16:17 -0700 Subject: [PATCH 174/300] [RISCV] Enable mul strength reduction for XTheadBa This vendor extension has the same shift_add as zba, and most of the same patterns are duplicated. Enable it here too so the configurations don't diverge. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/test/CodeGen/RISCV/rv64xtheadba.ll | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index dc7c6f83b98579..7b4bec2f65b741 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13408,7 +13408,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (VT != Subtarget.getXLenVT()) return SDValue(); - if (!Subtarget.hasStdExtZba()) + if (!Subtarget.hasStdExtZba() && !Subtarget.hasVendorXTHeadBa()) return SDValue(); ConstantSDNode *CNode = dyn_cast(N->getOperand(1)); diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll index 6f56babf28f5ec..1450c86c76d05f 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll @@ -268,6 +268,23 @@ define i64 @mul96(i64 %a) { ret i64 %c } +define i64 @mul137(i64 %a) { +; RV64I-LABEL: mul137: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 137 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul137: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 +; RV64XTHEADBA-NEXT: slli a0, a0, 7 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 137 + ret i64 %c +} + define i64 @mul160(i64 %a) { ; RV64I-LABEL: mul160: ; RV64I: # %bb.0: From c6e01627acf8591830ee1d211cff4d5388095f3d Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 16 Apr 2024 20:35:35 +0000 Subject: [PATCH 175/300] Revert "Reapply "[LV] Improve AnyOf reduction codegen. (#78304)"" This reverts commit c6e38b928c56f562aea68a8e90f02dbdf0eada85. Causes miscompiles, see comments on #78304. 
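
For context, the AnyOf reduction pattern touched by this revert selects
between a reduction's start value and a single loop-invariant value. A
minimal scalar sketch of such a loop (hypothetical names; the constants
mirror the select_const_i32_from_icmp test updated below):

  // r keeps the start value 3 unless some element differs from 3,
  // in which case the loop-invariant value 7 is selected instead.
  int anyOf(const int *v, long n) {
    int r = 3;                  // reduction start value
    for (long i = 0; i < n; ++i)
      r = (v[i] == 3) ? r : 7;  // AnyOf select of a loop-invariant value
    return r;                   // 3 iff every v[i] == 3, otherwise 7
  }

The reverted codegen carried this reduction as an i1 PHI combined with
bitwise OR (with a freeze before the final select); restoring the earlier
scheme keeps the reduction PHI in the original type and compares it
against the start value after the loop.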
--- .../include/llvm/Transforms/Utils/LoopUtils.h | 9 + llvm/lib/Transforms/Utils/LoopUtils.cpp | 24 +- .../Vectorize/LoopVectorizationPlanner.h | 4 +- .../Transforms/Vectorize/LoopVectorize.cpp | 76 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 13 +- .../LoopVectorize/AArch64/sve-select-cmp.ll | 88 +-- .../RISCV/select-cmp-reduction.ll | 742 +++++++++++++----- .../epilog-vectorization-any-of-reductions.ll | 149 +--- .../LoopVectorize/select-cmp-predicated.ll | 210 +++-- .../Transforms/LoopVectorize/select-cmp.ll | 146 ++-- ...tion-start-value-may-be-undef-or-poison.ll | 43 +- 11 files changed, 904 insertions(+), 600 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 345e09dce0b2b1..187ace3a0cbedf 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -372,6 +372,15 @@ RecurKind getMinMaxReductionRecurKind(Intrinsic::ID RdxID); /// Returns the comparison predicate used when expanding a min/max reduction. CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK); +/// See RecurrenceDescriptor::isAnyOfPattern for a description of the pattern we +/// are trying to match. In this pattern, we are only ever selecting between two +/// values: 1) an initial start value \p StartVal of the reduction PHI, and 2) a +/// loop invariant value. If any of lane value in \p Left, \p Right is not equal +/// to \p StartVal, select the loop invariant value. This is done by selecting +/// \p Right iff \p Left is equal to \p StartVal. +Value *createAnyOfOp(IRBuilderBase &Builder, Value *StartVal, RecurKind RK, + Value *Left, Value *Right); + /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind. /// The Builder's fast-math-flags must be set to propagate the expected values. Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 9d816c5220532c..73c5d636782294 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1034,6 +1034,15 @@ CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) { } } +Value *llvm::createAnyOfOp(IRBuilderBase &Builder, Value *StartVal, + RecurKind RK, Value *Left, Value *Right) { + if (auto VTy = dyn_cast(Left->getType())) + StartVal = Builder.CreateVectorSplat(VTy->getElementCount(), StartVal); + Value *Cmp = + Builder.CreateCmp(CmpInst::ICMP_NE, Left, StartVal, "rdx.select.cmp"); + return Builder.CreateSelect(Cmp, Left, Right, "rdx.select"); +} + Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right) { Type *Ty = Left->getType(); @@ -1142,13 +1151,16 @@ Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src, NewVal = SI->getTrueValue(); } + // Create a splat vector with the new value and compare this to the vector + // we want to reduce. + ElementCount EC = cast(Src->getType())->getElementCount(); + Value *Right = Builder.CreateVectorSplat(EC, InitVal); + Value *Cmp = + Builder.CreateCmp(CmpInst::ICMP_NE, Src, Right, "rdx.select.cmp"); + // If any predicate is true it means that we want to select the new value. - Value *AnyOf = - Src->getType()->isVectorTy() ? Builder.CreateOrReduce(Src) : Src; - // The compares in the loop may yield poison, which propagates through the - // bitwise ORs. Freeze it here before the condition is used. 
- AnyOf = Builder.CreateFreeze(AnyOf); - return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select"); + Cmp = Builder.CreateOrReduce(Cmp); + return Builder.CreateSelect(Cmp, NewVal, InitVal, "rdx.select"); } Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index ece2a34f180cb4..ebca2d855a4676 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -68,7 +68,9 @@ class VPBuilder { public: VPBuilder() = default; VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); } - VPBuilder(VPRecipeBase *InsertPt) { setInsertPoint(InsertPt); } + VPBuilder(VPRecipeBase *InsertPt) { + setInsertPoint(InsertPt->getParent(), InsertPt->getIterator()); + } /// Clear the insertion point: created instructions will not be inserted into /// a block. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2057cab46135ff..44885a95bd1020 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3051,8 +3051,9 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( } // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", - LoopScalarPreHeader->getFirstNonPHI()); + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()->getIterator()); // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); @@ -7450,6 +7451,7 @@ static void createAndCollectMergePhiForReduction( auto *PhiR = cast(RedResult->getOperand(0)); const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + TrackingVH ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Value *FinalValue = State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane())); auto *ResumePhi = @@ -7474,7 +7476,7 @@ static void createAndCollectMergePhiForReduction( BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), Incoming); else - BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming); + BCBlockPhi->addIncoming(ReductionStartValue, Incoming); } auto *OrigPhi = cast(PhiR->getUnderlyingValue()); @@ -7767,10 +7769,11 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( // Now, compare the remaining count and if there aren't enough iterations to // execute the vectorized epilogue skip to the scalar part. 
- LoopVectorPreHeader->setName("vec.epilog.ph"); - BasicBlock *VecEpilogueIterationCountCheck = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, - nullptr, "vec.epilog.iter.check", true); + BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; + VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); + LoopVectorPreHeader = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + LI, nullptr, "vec.epilog.ph"); emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, VecEpilogueIterationCountCheck); @@ -8893,10 +8896,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // A ComputeReductionResult recipe is added to the middle block, also for // in-loop reductions which compute their result in-loop, because generating // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. -// -// Adjust AnyOf reductions; replace the reduction phi for the selected value -// with a boolean reduction phi node to check if the condition is true in any -// iteration. The final value is selected by the final ComputeReductionResult. void LoopVectorizationPlanner::adjustRecipesForReductions( VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { @@ -9071,41 +9070,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( continue; const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - // Adjust AnyOf reductions; replace the reduction phi for the selected value - // with a boolean reduction phi node to check if the condition is true in - // any iteration. The final value is selected by the final - // ComputeReductionResult. - if (RecurrenceDescriptor::isAnyOfRecurrenceKind( - RdxDesc.getRecurrenceKind())) { - auto *Select = cast(*find_if(PhiR->users(), [](VPUser *U) { - return isa(U) || - (isa(U) && - cast(U)->getUnderlyingInstr()->getOpcode() == - Instruction::Select); - })); - VPValue *Cmp = Select->getOperand(0); - // If the compare is checking the reduction PHI node, adjust it to check - // the start value. - if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) { - for (unsigned I = 0; I != CmpR->getNumOperands(); ++I) - if (CmpR->getOperand(I) == PhiR) - CmpR->setOperand(I, PhiR->getStartValue()); - } - VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(Select); - - // If the true value of the select is the reduction phi, the new value is - // selected if the negated condition is true in any iteration. - if (Select->getOperand(1) == PhiR) - Cmp = Builder.createNot(Cmp); - VPValue *Or = Builder.createOr(PhiR, Cmp); - Select->getVPSingleValue()->replaceAllUsesWith(Or); - - // Convert the reduction phi to operate on bools. - PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( - OrigLoop->getHeader()->getContext()))); - } - // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. @@ -9138,9 +9102,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. 
Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); - if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() && - !RecurrenceDescriptor::isAnyOfRecurrenceKind( - RdxDesc.getRecurrenceKind())) { + if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); Type *RdxTy = RdxDesc.getRecurrenceType(); auto *Trunc = @@ -10181,19 +10143,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { Value *ResumeV = nullptr; // TODO: Move setting of resume values to prepareToExecute. if (auto *ReductionPhi = dyn_cast(&R)) { - const RecurrenceDescriptor &RdxDesc = - ReductionPhi->getRecurrenceDescriptor(); - RecurKind RK = RdxDesc.getRecurrenceKind(); - ResumeV = ReductionResumeValues.find(&RdxDesc)->second; - if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { - // VPReductionPHIRecipes for AnyOf reductions expect a boolean as - // start value; compare the final value from the main vector loop - // to the start value. - IRBuilder<> Builder( - cast(ResumeV)->getParent()->getFirstNonPHI()); - ResumeV = Builder.CreateICmpNE(ResumeV, - RdxDesc.getRecurrenceStartValue()); - } + ResumeV = ReductionResumeValues + .find(&ReductionPhi->getRecurrenceDescriptor()) + ->second; } else { // Create induction resume values for both widened pointer and // integer/fp inductions and update the start value of the induction diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 625319954e9b7b..9f242a1bee8f6c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -501,8 +501,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = RdxParts[0]; unsigned Op = RecurrenceDescriptor::getOpcode(RK); - if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) - Op = Instruction::Or; if (PhiR->isOrdered()) { ReducedPartRdx = RdxParts[State.UF - 1]; @@ -515,16 +513,19 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); - else + else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { + TrackingVH ReductionStartValue = + RdxDesc.getRecurrenceStartValue(); + ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK, + ReducedPartRdx, RdxPart); + } else ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); } } // Create the reduction after the loop. Note that inloop reductions create // the target reduction in the loop using a Reduction recipe. 
- if ((State.VF.isVector() || - RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) && - !PhiR->isInLoop()) { + if (State.VF.isVector() && !PhiR->isInLoop()) { ReducedPartRdx = createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll index 2470bca1e17b99..1c26ee8479e578 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll @@ -8,41 +8,39 @@ target triple = "aarch64-linux-gnu" define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 { ; CHECK-VF4IC1-LABEL: @select_const_i32_from_icmp ; CHECK-VF4IC1: vector.body: -; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor [[VEC_ICMP]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[NOT]] +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[VEC_SEL]]) -; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR]], i32 7, i32 3 +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 ; CHECK-VF4IC4-LABEL: @select_const_i32_from_icmp ; CHECK-VF4IC4: vector.body: -; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ] -; CHECK-VF4IC4: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ] -; CHECK-VF4IC4: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ] -; CHECK-VF4IC4: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI4:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ] ; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) ; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = 
icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) ; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) ; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; CHECK-VF4IC4-NEXT: [[NOT1:%.*]] = xor [[VEC_ICMP1]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC4-NEXT: [[NOT2:%.*]] = xor [[VEC_ICMP2]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC4-NEXT: [[NOT3:%.*]] = xor [[VEC_ICMP3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC4-NEXT: [[NOT4:%.*]] = xor [[VEC_ICMP4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = or [[VEC_PHI1]], [[NOT1]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = or [[VEC_PHI2]], [[NOT2]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = or [[VEC_PHI3]], [[NOT3]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = or [[VEC_PHI4]], [[NOT4]] +; CHECK-VF4IC4-NEXT: [[VEC_SEL1]] = select [[VEC_ICMP1]], [[VEC_PHI1]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL2]] = select [[VEC_ICMP2]], [[VEC_PHI2]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL3]] = select [[VEC_ICMP3]], [[VEC_PHI3]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL4]] = select [[VEC_ICMP4]], [[VEC_PHI4]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) ; CHECK-VF4IC4: middle.block: -; CHECK-VF4IC4-NEXT: [[OR1:%.*]] = or [[VEC_SEL2]], [[VEC_SEL1]] -; CHECK-VF4IC4-NEXT: [[OR2:%.*]] = or [[VEC_SEL3]], [[OR1]] -; CHECK-VF4IC4-NEXT: [[OR3:%.*]] = or [[VEC_SEL4]], [[OR2]] -; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[OR3]]) -; CHECK-VF4IC4-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[FR]], i32 7, i32 3 +; CHECK-VF4IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp ne [[VEC_SEL1]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = select [[VEC_ICMP5]], [[VEC_SEL1]], [[VEC_SEL2]] +; CHECK-VF4IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp ne [[VEC_SEL5]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = select [[VEC_ICMP6]], [[VEC_SEL5]], [[VEC_SEL3]] +; CHECK-VF4IC4-NEXT: [[VEC_ICMP7:%.*]] = icmp ne [[VEC_SEL6]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL7:%.*]] = select [[VEC_ICMP7]], [[VEC_SEL6]], [[VEC_SEL4]] +; CHECK-VF4IC4-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL7]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 entry: br label %for.body @@ -64,18 +62,21 @@ exit: ; preds = %for.body define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 { ; CHECK-VF4IC1-LABEL: @select_i32_from_icmp ; CHECK-VF4IC1: vector.ph: -; CHECK-VF4IC1-NOT: shufflevector -; CHECK-VF4IC1-NOT: shufflevector +; CHECK-VF4IC1: 
[[TMP1:%.*]] = insertelement poison, i32 %a, i64 0 +; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement poison, i32 %b, i64 0 +; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector [[TMP2]], poison, zeroinitializer ; CHECK-VF4IC1: vector.body: -; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor [[VEC_ICMP]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[NOT]] +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], [[SPLAT_OF_B]] ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[VEC_SEL]]) -; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR]], i32 %b, i32 %a +; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement poison, i32 %a, i64 0 +; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector [[FIN_INS]], poison, zeroinitializer +; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne [[VEC_SEL]], [[FIN_SPLAT]] +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_CMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a ; CHECK-VF4IC4-LABEL: @select_i32_from_icmp ; CHECK-VF4IC4: vector.body: @@ -100,15 +101,14 @@ exit: ; preds = %for.body define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 { ; CHECK-VF4IC1-LABEL: @select_const_i32_from_fcmp ; CHECK-VF4IC1: vector.body: -; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = fcmp fast ueq [[VEC_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor [[VEC_ICMP]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[NOT]] +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[VEC_SEL]]) -; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR]], i32 1, i32 2 +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2 ; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp ; CHECK-VF4IC4: vector.body: @@ -156,17 +156,17 @@ exit: ; preds = %for.body define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) #0 { ; 
CHECK-VF4IC1-LABEL: @pred_select_const_i32_from_icmp ; CHECK-VF4IC1: vector.body: -; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load ; CHECK-VF4IC1: [[MASK:%.*]] = icmp sgt [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 35, i64 0), poison, zeroinitializer) ; CHECK-VF4IC1: [[MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr {{%.*}}, i32 4, [[MASK]], poison) ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[VEC_SEL_TMP:%.*]] = or [[VEC_PHI]], [[VEC_ICMP]] -; CHECK-VF4IC1: [[VEC_SEL:%.*]] = select [[MASK]], [[VEC_SEL_TMP]], [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[VEC_SEL_TMP:%.*]] = select [[VEC_ICMP]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), [[VEC_PHI]] +; CHECK-VF4IC1: [[VEC_SEL:%.*]] = select [[MASK]], [[VEC_SEL_TMP]], [[VEC_PHI]] ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[VEC_SEL]]) -; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR]], i32 1, i32 0 +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL]], zeroinitializer +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 0 ; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp ; CHECK-VF4IC4: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll index 2b58acbfe9cc98..8a2dc0abb0de8e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S \ ; RUN: < %s | FileCheck %s ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 \ @@ -6,59 +7,109 @@ target triple = "riscv64" define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 { -; CHECK-LABEL: @select_icmp +; CHECK-LABEL: define i32 @select_icmp( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> 
[[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0 +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 [[Y]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP7]], [[X]] +; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 
[[COND_LCSSA]] ; -; SCALABLE-LABEL: @select_icmp +; SCALABLE-LABEL: define i32 @select_icmp( +; SCALABLE-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X:%.*]], i64 0 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP4]] -; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 -; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp slt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp slt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP10]] = select [[TMP9]], [[VEC_PHI]], [[BROADCAST_SPLAT2]] +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP11]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP9]]) -; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP13]] -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0 +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP10]], zeroinitializer +; SCALABLE-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 [[Y]], i32 0 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; SCALABLE-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; SCALABLE-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], [[X]] +; SCALABLE-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; SCALABLE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE: for.end: +; SCALABLE-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[COND_LCSSA]] ; entry: br label %for.body @@ -79,59 +130,109 @@ for.end: } define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 { -; CHECK-LABEL: @select_fcmp +; CHECK-LABEL: define i32 @select_fcmp( +; CHECK-SAME: float [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0 +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 [[Y]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast olt float [[TMP7]], [[X]] +; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[COND_LCSSA]] ; -; SCALABLE-LABEL: @select_fcmp +; SCALABLE-LABEL: define i32 @select_fcmp( +; SCALABLE-SAME: float [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: 
[[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[X:%.*]], i64 0 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[X]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP4]] -; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 -; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast olt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = fcmp fast olt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP10]] = select [[TMP9]], [[VEC_PHI]], [[BROADCAST_SPLAT2]] +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP9]]) -; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP13]] -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, 
i32 0 +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP10]], zeroinitializer +; SCALABLE-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 [[Y]], i32 0 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]] +; SCALABLE-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; SCALABLE-NEXT: [[CMP1:%.*]] = fcmp fast olt float [[TMP13]], [[X]] +; SCALABLE-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; SCALABLE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE: for.end: +; SCALABLE-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[COND_LCSSA]] ; entry: br label %for.body @@ -152,55 +253,101 @@ for.end: } define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 { -; CHECK-LABEL: @select_const_i32_from_icmp +; CHECK-LABEL: define i32 @select_const_i32_from_icmp( +; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add 
nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 7, i32 3 +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 7, i32 3 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 3 +; CHECK-NEXT: [[TMP12]] = select i1 [[TMP11]], i32 [[TMP8]], i32 7 +; CHECK-NEXT: [[TMP13]] = add nuw nsw i64 [[TMP15]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; -; SCALABLE-LABEL: @select_const_i32_from_icmp +; SCALABLE-LABEL: define i32 @select_const_i32_from_icmp( +; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: 
[[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 ; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] +; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP9]]) -; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP13]] -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 7, i32 3 +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP9]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 7, i32 3 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[TMP21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP21]] +; SCALABLE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 3 +; SCALABLE-NEXT: [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 7 +; SCALABLE-NEXT: [[TMP19]] = add nuw nsw i64 [[TMP21]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]] +; SCALABLE-NEXT: br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE: exit: +; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[DOTLCSSA]] ; entry: br label %for.body @@ -221,55 +368,113 @@ exit: ; preds = %for.body } define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 { -; CHECK-LABEL: @select_i32_from_icmp +; CHECK-LABEL: define i32 @select_i32_from_icmp( +; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; 
CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %b, i32 %a +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[B]], i32 [[A]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 3 +; CHECK-NEXT: [[TMP12]] = select i1 [[TMP11]], i32 
[[TMP8]], i32 [[B]] +; CHECK-NEXT: [[TMP13]] = add nuw nsw i64 [[TMP15]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; -; SCALABLE-LABEL: @select_i32_from_icmp +; SCALABLE-LABEL: define i32 @select_i32_from_icmp( +; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 +; SCALABLE-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; SCALABLE-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[B]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 ; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] +; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP9]]) -; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP13]] -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 
%b, i32 %a +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP9]], [[DOTSPLAT]] +; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[B]], i32 [[A]] +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[TMP21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP21]] +; SCALABLE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 3 +; SCALABLE-NEXT: [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 [[B]] +; SCALABLE-NEXT: [[TMP19]] = add nuw nsw i64 [[TMP21]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]] +; SCALABLE-NEXT: br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE: exit: +; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[DOTLCSSA]] ; entry: br label %for.body @@ -290,55 +495,101 @@ exit: ; preds = %for.body } define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 { -; CHECK-LABEL: @select_const_i32_from_fcmp +; CHECK-LABEL: define i32 @select_const_i32_from_fcmp( +; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], -; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP5]] = or <4 x i1> 
[[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 2 +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 2 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00 +; CHECK-NEXT: [[TMP12]] = select i1 [[TMP11]], i32 [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP13]] = add nuw nsw i64 [[TMP15]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; -; SCALABLE-LABEL: @select_const_i32_from_fcmp +; SCALABLE-LABEL: define i32 @select_const_i32_from_fcmp( +; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, 
i32 2, i64 0), poison, zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 ; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast ueq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] +; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP9]]) -; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP13]] -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 2 +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP9]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 1, i32 2 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[TMP21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP21]] +; SCALABLE-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4 +; SCALABLE-NEXT: [[TMP17:%.*]] = fcmp fast ueq float [[TMP16]], 3.000000e+00 +; SCALABLE-NEXT: [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 1 +; SCALABLE-NEXT: [[TMP19]] = add nuw nsw i64 [[TMP21]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]] +; SCALABLE-NEXT: br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE: exit: +; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[DOTLCSSA]] ; entry: br label %for.body @@ -359,11 +610,41 @@ exit: ; preds = %for.body } define float @select_const_f32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 { -; CHECK-LABEL: @select_const_f32_from_icmp -; CHECK-NOT: vector.body +; CHECK-LABEL: define float @select_const_f32_from_icmp( +; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; 
CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3
+; CHECK-NEXT: [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00
+; CHECK-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK: exit:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ]
+; CHECK-NEXT: ret float [[DOTLCSSA]]
 ;
-; SCALABLE-LABEL: @select_const_f32_from_icmp
-; SCALABLE-NOT: vector.body
+; SCALABLE-LABEL: define float @select_const_f32_from_icmp(
+; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SCALABLE-NEXT: entry:
+; SCALABLE-NEXT: br label [[FOR_BODY:%.*]]
+; SCALABLE: for.body:
+; SCALABLE-NEXT: [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; SCALABLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+; SCALABLE-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3
+; SCALABLE-NEXT: [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00
+; SCALABLE-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
+; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N]]
+; SCALABLE-NEXT: br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; SCALABLE: exit:
+; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ]
+; SCALABLE-NEXT: ret float [[DOTLCSSA]]
 ;
 entry:
   br label %for.body
@@ -384,63 +665,127 @@ exit: ; preds = %for.body
 }
 
 define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) #0 {
-; CHECK-LABEL: @pred_select_const_i32_from_icmp
+; CHECK-LABEL: define i32 @pred_select_const_i32_from_icmp(
+; CHECK-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 35, i32 35, i32 35, i32 35>
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[SRC2:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i1> [[VEC_PHI]], [[TMP8]]
-; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP9]], <4 x i1> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 35, i32 35, i32 35, i32 35>
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP5]], i32 4, <4 x i1> [[TMP3]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP7]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[PREDPHI]])
-; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP12]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 0
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[PREDPHI]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP9]], i32 1, i32 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP10]], 35
+; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK: if.then:
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP11]], 2
+; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
+; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[R_1_LCSSA]]
 ;
-; SCALABLE-LABEL: @pred_select_const_i32_from_icmp
+; SCALABLE-LABEL: define i32 @pred_select_const_i32_from_icmp(
+; SCALABLE-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SCALABLE-NEXT: entry:
+; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE: vector.ph:
 ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
-; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
-; SCALABLE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
-; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[TMP4]]
-; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
-; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
-; SCALABLE-NEXT: [[TMP8:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[SRC2:%.*]], i64 [[TMP4]]
-; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
-; SCALABLE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP6]]
+; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
+; SCALABLE-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[TMP6]]
+; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; SCALABLE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> poison)
 ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[TMP13:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP12]]
-; SCALABLE-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[VEC_PHI]]
-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+-; SCALABLE-NEXT: [[TMP13:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP12]]
+-; SCALABLE-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[VEC_PHI]]
+-; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+-; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+-; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
+; SCALABLE-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
+; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE: middle.block:
-; SCALABLE-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[PREDPHI]])
-; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP18]]
-; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 0
+-; SCALABLE-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[PREDPHI]])
+-; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP18]]
+-; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 0
+; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[PREDPHI]], zeroinitializer
+; SCALABLE-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
+; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP15]], i32 1, i32 0
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE: scalar.ph:
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; SCALABLE-NEXT: br label [[FOR_BODY:%.*]]
+; SCALABLE: for.body:
+; SCALABLE-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; SCALABLE-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]]
+; SCALABLE-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; SCALABLE-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP16]], 35
+; SCALABLE-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; SCALABLE: if.then:
+; SCALABLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]]
+; SCALABLE-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; SCALABLE-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP17]], 2
+; SCALABLE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
+; SCALABLE-NEXT: br label [[FOR_INC]]
+; SCALABLE: for.inc:
+; SCALABLE-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
+; SCALABLE-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1
+; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; SCALABLE: for.end.loopexit:
+; SCALABLE-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; SCALABLE-NEXT: ret i32 [[R_1_LCSSA]]
 ;
 entry:
   br label %for.body
@@ -472,3 +817,34 @@ for.end.loopexit: ; preds = %for.inc
 }
 
 attributes #0 = { "target-features"="+f,+v" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.
+; SCALABLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; SCALABLE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; SCALABLE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; SCALABLE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; SCALABLE: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; SCALABLE: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; SCALABLE: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; SCALABLE: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; SCALABLE: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; SCALABLE: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; SCALABLE: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; SCALABLE: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
+; SCALABLE: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]}
+; SCALABLE: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
index c721da7597b1c5..0b872709ec6c6e 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -19,20 +19,20 @@ define i32 @any_of_reduction_epilog(ptr %src, i64 %N) {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP1]]
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
+; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP8]], zeroinitializer
 ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP7]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i32 1, i32 0
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 0
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; CHECK: vec.epilog.iter.check:
@@ -42,33 +42,32 @@ define i32 @any_of_reduction_epilog(ptr %src, i64 %N) {
 ; CHECK: vec.epilog.ph:
 ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[BC_MERGE_RDX]], 0
 ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
-; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP9]], i64 0
-; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_MERGE_RDX]], i64 0
+; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
 ; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX5]], 0
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP10]]
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD7]], zeroinitializer
-; CHECK-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI6]], [[TMP13]]
+; CHECK-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> [[VEC_PHI6]]
 ; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
 ; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <4 x i32> [[TMP17]], zeroinitializer
 ; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
-; CHECK-NEXT: [[TMP17:%.*]] = freeze i1 [[TMP16]]
-; CHECK-NEXT: [[RDX_SELECT9:%.*]] = select i1 [[TMP17]], i32 1, i32 0
+; CHECK-NEXT: [[RDX_SELECT9:%.*]] = select i1 [[TMP16]], i32 1, i32 0
 ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK: vec.epilog.scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -102,104 +101,6 @@ exit:
 ret i32 %select
 }
 
-define i32 @any_of_reduction_epilog_arg_as_start_value(ptr %src, i64 %N, i32 %start) {
-; CHECK-LABEL: define i32 @any_of_reduction_epilog_arg_as_start_value(
-; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) {
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP7]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i32 1, i32 [[START]]
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[BC_MERGE_RDX]], [[START]]
-; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
-; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
-; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP9]], i64 0
-; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX5]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD7]], zeroinitializer
-; CHECK-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI6]], [[TMP13]]
-; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
-; CHECK-NEXT: [[TMP17:%.*]] = freeze i1 [[TMP16]]
-; CHECK-NEXT: [[RDX_SELECT9:%.*]] = select i1 [[TMP17]], i32 1, i32 [[START]]
-; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ [[START]], [[ITER_CHECK]] ], [ [[START]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
-; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i8 [[LOAD]], 0
-; CHECK-NEXT: [[SELECT]] = select i1 [[ICMP]], i32 1, i32 [[RED]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[ICMP3:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[ICMP3]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[SELECT_LCSSA]]
-;
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
- %red = phi i32 [ %start, %entry ], [ %select, %loop ]
- %gep = getelementptr inbounds i8, ptr %src, i64 %iv
- %load = load i8, ptr %gep, align 1
- %icmp = icmp eq i8 %load, 0
- %select = select i1 %icmp, i32 1, i32 %red
- %iv.next = add i64 %iv, 1
- %icmp3 = icmp eq i64 %iv, %N
- br i1 %icmp3, label %exit, label %loop
-
-exit:
- ret i32 %select
-}
 
 define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-LABEL: define i1 @any_of_reduction_i1_epilog(
@@ -223,15 +124,14 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP3]] = or <4 x i1> [[VEC_PHI]], [[TMP2]]
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i1> [[VEC_PHI]], <4 x i1> zeroinitializer
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = freeze i1 [[TMP5]]
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i1> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
 ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i1 false, i1 false
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -244,11 +144,10 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ false, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ]
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i1 [[BC_MERGE_RDX]], false
 ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4
 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]]
 ; CHECK-NEXT: [[IND_END5:%.*]] = trunc i64 [[N_VEC3]] to i32
-; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0
+; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BC_MERGE_RDX]], i64 0
 ; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
@@ -261,22 +160,21 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND11:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[VEC_IND11]], [[BROADCAST_SPLAT14]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP10]] = or <4 x i1> [[VEC_PHI10]], [[TMP9]]
+; CHECK-NEXT: [[TMP10]] = select <4 x i1> [[TMP8]], <4 x i1> [[VEC_PHI10]], <4 x i1> zeroinitializer
 ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX9]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT12]] = add <4 x i32> [[VEC_IND11]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]])
-; CHECK-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]]
+; CHECK-NEXT: [[RDX_SELECT_CMP16:%.*]] = icmp ne <4 x i1> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP16]])
 ; CHECK-NEXT: [[RDX_SELECT16:%.*]] = select i1 [[TMP13]], i1 false, i1 false
 ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
 ; CHECK-NEXT: br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
 ; CHECK: vec.epilog.scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i1 [ false, [[ITER_CHECK]] ], [ false, [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i1 [ false, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -287,7 +185,7 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1
 ; CHECK-NEXT: [[CMP_2:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_2]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP_2]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i1 [[SEL_LCSSA]]
@@ -321,7 +219,4 @@ exit:
 ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
index 6a9f83a9e0aa2d..1b4bcf6a3739a1 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll
@@ -1,114 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s --check-prefix=CHECK-VF2IC1
 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC2
 
 define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) {
-; CHECK-VF2IC1-LABEL: @pred_select_const_i32_from_icmp(
+; CHECK-VF2IC1-LABEL: define i32 @pred_select_const_i32_from_icmp(
+; CHECK-VF2IC1-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF2IC1-NEXT: entry:
+; CHECK-VF2IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-VF2IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-VF2IC1: vector.ph:
+; CHECK-VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; CHECK-VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF2IC1-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-VF2IC1: vector.body:
-; CHECK-VF2IC1: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue2 ]
-; CHECK-VF2IC1: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr {{%.*}}, align 4
-; CHECK-VF2IC1-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 35, i32 35>
-; CHECK-VF2IC1-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-VF2IC1-NEXT: br i1 [[TMP5]], label %pred.load.if, label %pred.load.continue
+; CHECK-VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
+; CHECK-VF2IC1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-VF2IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF2IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]]
+; CHECK-VF2IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-VF2IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
+; CHECK-VF2IC1-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 35, i32 35>
+; CHECK-VF2IC1-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; CHECK-VF2IC1-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK-VF2IC1: pred.load.if:
-; CHECK-VF2IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC2:%.*]], i64 {{%.*}}
-; CHECK-VF2IC1-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
-; CHECK-VF2IC1-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; CHECK-VF2IC1-NEXT: br label %pred.load.continue
+; CHECK-VF2IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP0]]
+; CHECK-VF2IC1-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-VF2IC1-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0
+; CHECK-VF2IC1-NEXT: br label [[PRED_LOAD_CONTINUE]]
 ; CHECK-VF2IC1: pred.load.continue:
-; CHECK-VF2IC1-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, %vector.body ], [ [[TMP8]], %pred.load.if ]
-; CHECK-VF2IC1-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-VF2IC1-NEXT: br i1 [[TMP10]], label %pred.load.if1, label %pred.load.continue2
+; CHECK-VF2IC1-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
+; CHECK-VF2IC1-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; CHECK-VF2IC1-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
 ; CHECK-VF2IC1: pred.load.if1:
-; CHECK-VF2IC1: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 {{%.*}}
-; CHECK-VF2IC1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
-; CHECK-VF2IC1-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1
-; CHECK-VF2IC1-NEXT: br label %pred.load.continue2
+; CHECK-VF2IC1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1
+; CHECK-VF2IC1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP10]]
+; CHECK-VF2IC1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+; CHECK-VF2IC1-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP12]], i32 1
+; CHECK-VF2IC1-NEXT: br label [[PRED_LOAD_CONTINUE2]]
 ; CHECK-VF2IC1: pred.load.continue2:
-; CHECK-VF2IC1-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %pred.load.continue ], [ [[TMP14]], %pred.load.if1 ]
-; CHECK-VF2IC1-NEXT: [[TMP16:%.*]] = icmp eq <2 x i32> [[TMP15]], <i32 2, i32 2>
-; CHECK-VF2IC1-NEXT: [[TMP17:%.*]] = or <2 x i1> [[VEC_PHI]], [[TMP16]]
-; CHECK-VF2IC1-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP17]], <2 x i1> [[VEC_PHI]]
-; CHECK-VF2IC1: br i1 {{%.*}}, label %middle.block, label %vector.body
+; CHECK-VF2IC1-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
+; CHECK-VF2IC1-NEXT: [[TMP15:%.*]] = icmp eq <2 x i32> [[TMP14]], <i32 2, i32 2>
+; CHECK-VF2IC1-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> <i32 1, i32 1>, <2 x i32> [[VEC_PHI]]
+; CHECK-VF2IC1-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP3]], <2 x i32> [[TMP16]], <2 x i32> [[VEC_PHI]]
+; CHECK-VF2IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-VF2IC1-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF2IC1-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-VF2IC1: middle.block:
-; CHECK-VF2IC1-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[PREDPHI]])
-; CHECK-VF2IC1-NEXT: [[FR_TMP20:%.*]] = freeze i1 [[TMP20]]
-; CHECK-VF2IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR_TMP20]], i32 1, i32 0
+; CHECK-VF2IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i32> [[PREDPHI]], zeroinitializer
+; CHECK-VF2IC1-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
+; CHECK-VF2IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP18]], i32 1, i32 0
+; CHECK-VF2IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF2IC1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-VF2IC1: scalar.ph:
-; CHECK-VF2IC1: [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
-; CHECK-VF2IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ]
-; CHECK-VF2IC1-NEXT: br label %for.body
+; CHECK-VF2IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-VF2IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF2IC1-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK-VF2IC1: for.body:
-; CHECK-VF2IC1: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ]
-; CHECK-VF2IC1: [[TMP21:%.*]] = load i32, ptr {{%.*}}, align 4
-; CHECK-VF2IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP21]], 35
-; CHECK-VF2IC1-NEXT: br i1 [[CMP1]], label %if.then, label %for.inc
+; CHECK-VF2IC1-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-VF2IC1-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-VF2IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]]
+; CHECK-VF2IC1-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-VF2IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35
+; CHECK-VF2IC1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; CHECK-VF2IC1: if.then:
-; CHECK-VF2IC1: [[TMP22:%.*]] = load i32, ptr {{%.*}}, align 4
-; CHECK-VF2IC1-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP22]], 2
+; CHECK-VF2IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]]
+; CHECK-VF2IC1-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-VF2IC1-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2
 ; CHECK-VF2IC1-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF2IC1-NEXT: br label %for.inc
+; CHECK-VF2IC1-NEXT: br label [[FOR_INC]]
 ; CHECK-VF2IC1: for.inc:
-; CHECK-VF2IC1-NEXT: [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ]
+; CHECK-VF2IC1-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
+; CHECK-VF2IC1-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1
+; CHECK-VF2IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-VF2IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-VF2IC1: for.end.loopexit:
-; CHECK-VF2IC1-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF2IC1-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
 ; CHECK-VF2IC1-NEXT: ret i32 [[R_1_LCSSA]]
 ;
-; CHECK-VF1IC2-LABEL: @pred_select_const_i32_from_icmp(
+; CHECK-VF1IC2-LABEL: define i32 @pred_select_const_i32_from_icmp(
+; CHECK-VF1IC2-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF1IC2-NEXT: entry:
+; CHECK-VF1IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-VF1IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-VF1IC2: vector.ph:
+; CHECK-VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
+; CHECK-VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF1IC2-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-VF1IC2: vector.body:
-; CHECK-VF1IC2: [[VEC_PHI:%.*]] = phi i1 [ false, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue3 ]
-; CHECK-VF1IC2-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %vector.ph ], [ [[PREDPHI5:%.*]], %pred.load.continue3 ]
-; CHECK-VF1IC2: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 {{%.*}}
-; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 {{%.*}}
-; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
-; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
-; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 35
-; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 35
-; CHECK-VF1IC2-NEXT: br i1 [[TMP4]], label %pred.load.if, label %pred.load.continue
+; CHECK-VF1IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE3:%.*]] ]
+; CHECK-VF1IC2-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE3]] ]
+; CHECK-VF1IC2-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI4:%.*]], [[PRED_LOAD_CONTINUE3]] ]
+; CHECK-VF1IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]]
+; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP1]]
+; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
+; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
+; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP4]], 35
+; CHECK-VF1IC2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], 35
+; CHECK-VF1IC2-NEXT: br i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK-VF1IC2: pred.load.if:
-; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC2:%.*]], i64 {{%.*}}
-; CHECK-VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
-; CHECK-VF1IC2-NEXT: br label %pred.load.continue
+; CHECK-VF1IC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP0]]
+; CHECK-VF1IC2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-VF1IC2-NEXT: br label [[PRED_LOAD_CONTINUE]]
 ; CHECK-VF1IC2: pred.load.continue:
-; CHECK-VF1IC2-NEXT: [[TMP8:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP7]], %pred.load.if ]
-; CHECK-VF1IC2-NEXT: br i1 [[TMP5]], label %pred.load.if2, label %pred.load.continue3
+; CHECK-VF1IC2-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
+; CHECK-VF1IC2-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3]]
 ; CHECK-VF1IC2: pred.load.if2:
-; CHECK-VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 {{%.*}}
-; CHECK-VF1IC2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-VF1IC2-NEXT: br label %pred.load.continue3
+; CHECK-VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP1]]
+; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
+; CHECK-VF1IC2-NEXT: br label [[PRED_LOAD_CONTINUE3]]
 ; CHECK-VF1IC2: pred.load.continue3:
-; CHECK-VF1IC2-NEXT: [[TMP11:%.*]] = phi i32 [ poison, %pred.load.continue ], [ [[TMP10]], %pred.load.if2 ]
-; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 2
-; CHECK-VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 2
-; CHECK-VF1IC2-NEXT: [[TMP14:%.*]] = or i1 [[VEC_PHI]], [[TMP12]]
-; CHECK-VF1IC2-NEXT: [[TMP15:%.*]] = or i1 [[VEC_PHI2]], [[TMP13]]
-; CHECK-VF1IC2-NEXT: [[PREDPHI]] = select i1 [[TMP4]], i1 [[TMP14]], i1 [[VEC_PHI]]
-; CHECK-VF1IC2-NEXT: [[PREDPHI5]] = select i1 [[TMP5]], i1 [[TMP15]], i1 [[VEC_PHI2]]
-; CHECK-VF1IC2: br i1 {{%.*}}, label %middle.block, label %vector.body
+; CHECK-VF1IC2-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF2]] ]
+; CHECK-VF1IC2-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 2
+; CHECK-VF1IC2-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 2
+; CHECK-VF1IC2-NEXT: [[TMP16:%.*]] = select i1 [[TMP14]], i32 1, i32 [[VEC_PHI]]
+; CHECK-VF1IC2-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], i32 1, i32 [[VEC_PHI1]]
+; CHECK-VF1IC2-NEXT: [[PREDPHI]] = select i1 [[TMP6]], i32 [[TMP16]], i32 [[VEC_PHI]]
+; CHECK-VF1IC2-NEXT: [[PREDPHI4]] = select i1 [[TMP7]], i32 [[TMP17]], i32 [[VEC_PHI1]]
+; CHECK-VF1IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-VF1IC2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF1IC2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-VF1IC2: middle.block:
-; CHECK-VF1IC2-NEXT: [[OR:%.*]] = or i1 [[PREDPHI5]], [[PREDPHI]]
-; CHECK-VF1IC2-NEXT: [[FR_OR:%.*]] = freeze i1 [[OR]]
-; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR_OR]], i32 1, i32 0
-; CHECK-VF1IC2: br i1 {{%.*}}, label %for.end.loopexit, label %scalar.ph
+; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[PREDPHI]], 0
+; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[PREDPHI]], i32 [[PREDPHI4]]
+; CHECK-VF1IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF1IC2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-VF1IC2: scalar.ph:
-; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
-; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ]
-; CHECK-VF1IC2-NEXT: br label %for.body
+; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC2-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK-VF1IC2: for.body:
-; CHECK-VF1IC2-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], %for.inc ], [ [[BC_RESUME_VAL]], %scalar.ph ]
-; CHECK-VF1IC2-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ]
-; CHECK-VF1IC2: [[TMP19:%.*]] = load i32, ptr {{%.*}}, align 4
+; CHECK-VF1IC2-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-VF1IC2-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-VF1IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]]
+; CHECK-VF1IC2-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-VF1IC2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35
-; CHECK-VF1IC2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label %for.inc
+; CHECK-VF1IC2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; CHECK-VF1IC2: if.then:
-; CHECK-VF1IC2: [[TMP20:%.*]] = load i32, ptr {{%.*}}, align 4
+; CHECK-VF1IC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]]
+; CHECK-VF1IC2-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
 ; CHECK-VF1IC2-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2
 ; CHECK-VF1IC2-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF1IC2-NEXT: br label %for.inc
+; CHECK-VF1IC2-NEXT: br label [[FOR_INC]]
 ; CHECK-VF1IC2: for.inc:
-; CHECK-VF1IC2-NEXT: [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ]
-; CHECK-VF1IC2: br i1 {{%.*}}, label %for.end.loopexit, label %for.body
+; CHECK-VF1IC2-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
+; CHECK-VF1IC2-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1
+; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
+; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-VF1IC2: for.end.loopexit:
-; CHECK-VF1IC2-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF1IC2-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
 ; CHECK-VF1IC2-NEXT: ret i32 [[R_1_LCSSA]]
 ;
 entry:
@@ -139,3 +180,14 @@ for.end.loopexit: ; preds = %for.inc
 %r.1.lcssa = phi i32 [ %r.1, %for.inc ]
 ret i32 %r.1.lcssa
 }
+;.
+; CHECK-VF2IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-VF2IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-VF2IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VF2IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; CHECK-VF1IC2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-VF1IC2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-VF1IC2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-VF1IC2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 993b56a05207be..c9f2aaef6d5c8e 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -5,47 +5,45 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_icmp
 ; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 ; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 ; CHECK-VF4IC4: vector.body:
-; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
 ; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC4-NEXT: [[NOT1:%.*]] = xor <4 x i1> [[VEC_ICMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT: [[NOT2:%.*]] = xor <4 x i1> [[VEC_ICMP2]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT: [[NOT3:%.*]] = xor <4 x i1> [[VEC_ICMP3]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT: [[NOT4:%.*]] = xor <4 x i1> [[VEC_ICMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = or <4 x i1> [[VEC_PHI1]], [[NOT1]]
-; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = or <4 x i1> [[VEC_PHI2]], [[NOT2]]
-; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = or <4 x i1> [[VEC_PHI3]], [[NOT3]]
-; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = or <4 x i1> [[VEC_PHI4]], [[NOT4]]
+; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = select <4 x i1> [[VEC_ICMP1]], <4 x i32> [[VEC_PHI1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = select <4 x i1> [[VEC_ICMP2]], <4 x i32> [[VEC_PHI2]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = select <4 x i1> [[VEC_ICMP3]], <4 x i32> [[VEC_PHI3]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = select <4 x i1> [[VEC_ICMP4]], <4 x i32> [[VEC_PHI4]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 ; CHECK-VF4IC4: middle.block:
-; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = or <4 x i1> [[VEC_SEL2]], [[VEC_SEL1]]
-; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = or <4 x i1> [[VEC_SEL3]], [[VEC_SEL5]]
-; CHECK-VF4IC4-NEXT: [[VEC_SEL7:%.*]] = or <4 x i1> [[VEC_SEL4]], [[VEC_SEL6]]
-; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL7]])
-; CHECK-VF4IC4-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp ne <4 x i32> [[VEC_SEL1]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = select <4 x i1> [[VEC_ICMP5]], <4 x i32> [[VEC_SEL1]], <4 x i32> [[VEC_SEL2]]
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp ne <4 x i32> [[VEC_SEL5]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = select <4 x i1> [[VEC_ICMP6]], <4 x i32> [[VEC_SEL5]], <4 x i32> [[VEC_SEL3]]
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP7:%.*]] = icmp ne <4 x i32> [[VEC_SEL6]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL_FIN:%.*]] = select <4 x i1> [[VEC_ICMP7]], <4 x i32> [[VEC_SEL6]], <4 x i32> [[VEC_SEL4]]
+; CHECK-VF4IC4-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL_FIN]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 ; CHECK-VF1IC4: vector.body:
-; CHECK-VF1IC4: [[VEC_PHI1:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF1IC4: [[VEC_PHI1:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
 ; CHECK-VF1IC4: [[VEC_LOAD1:%.*]] = load i32
 ; CHECK-VF1IC4-NEXT: [[VEC_LOAD2:%.*]] = load i32
 ; CHECK-VF1IC4-NEXT: [[VEC_LOAD3:%.*]] = load i32
@@ -54,20 +52,17 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq i32 [[VEC_LOAD2]], 3
 ; CHECK-VF1IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq i32 [[VEC_LOAD3]], 3
 ; CHECK-VF1IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq i32 [[VEC_LOAD4]], 3
-; CHECK-VF1IC4-NEXT: [[NOT1:%.*]] = xor i1 [[VEC_ICMP1]], true
-; CHECK-VF1IC4-NEXT: [[NOT2:%.*]] = xor i1 [[VEC_ICMP2]], true
-; CHECK-VF1IC4-NEXT: [[NOT3:%.*]] = xor i1 [[VEC_ICMP3]], true
-; CHECK-VF1IC4-NEXT: [[NOT4:%.*]] = xor i1 [[VEC_ICMP4]], true
-; CHECK-VF1IC4-NEXT: [[VEC_SEL1:%.*]] = or i1 [[VEC_PHI1]], [[NOT1]]
-; CHECK-VF1IC4-NEXT: [[VEC_SEL2:%.*]] = or i1 [[VEC_PHI2]], [[NOT2]]
-; CHECK-VF1IC4-NEXT: [[VEC_SEL3:%.*]] = or i1 [[VEC_PHI3]], [[NOT3]]
-; CHECK-VF1IC4-NEXT: [[VEC_SEL4:%.*]] = or i1 [[VEC_PHI4]], [[NOT4]]
+; CHECK-VF1IC4-NEXT: [[VEC_SEL1]] = select i1 [[VEC_ICMP1]], i32 [[VEC_PHI1]], i32 7
+; CHECK-VF1IC4-NEXT: [[VEC_SEL2]] = select i1 [[VEC_ICMP2]], i32 [[VEC_PHI2]], i32 7
+; CHECK-VF1IC4-NEXT: [[VEC_SEL3]] = select i1 [[VEC_ICMP3]], i32 [[VEC_PHI3]], i32 7
+; CHECK-VF1IC4-NEXT: [[VEC_SEL4]] = select i1 [[VEC_ICMP4]], i32 [[VEC_PHI4]], i32 7
 ; CHECK-VF1IC4: middle.block:
-; CHECK-VF1IC4-NEXT: [[VEC_SEL5:%.*]] = or i1 [[VEC_SEL2]], [[VEC_SEL1]]
-; CHECK-VF1IC4-NEXT: [[VEC_SEL6:%.*]] = or i1 [[VEC_SEL3]], [[VEC_SEL5]]
-; CHECK-VF1IC4-NEXT: [[OR_RDX:%.*]] = or i1 [[VEC_SEL4]], [[VEC_SEL6]]
-; CHECK-VF1IC4-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF1IC4-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp ne i32 [[VEC_SEL1]], 3
+; CHECK-VF1IC4-NEXT: [[VEC_SEL5:%.*]] = select i1 [[VEC_ICMP4]], i32 [[VEC_SEL1]], i32 [[VEC_SEL2]]
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp ne i32 [[VEC_SEL5]], 3
+; CHECK-VF1IC4-NEXT: [[VEC_SEL6:%.*]] = select i1 [[VEC_ICMP5]], i32 [[VEC_SEL5]], i32 [[VEC_SEL3]]
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp ne i32 [[VEC_SEL6]], 3
+; CHECK-VF1IC4-NEXT: {{.*}} = select i1 [[VEC_ICMP6]], i32 [[VEC_SEL6]], i32 [[VEC_SEL4]]
 entry:
 br label %for.body
@@ -91,14 +86,14 @@ exit: ; preds = %for.body
 define i32 @select_const_i32_from_icmp2(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_icmp2
 ; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32> [[VEC_PHI]]
 ; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 entry:
 br label %for.body
@@ -122,18 +117,21 @@ exit: ; preds = %for.body
 define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) {
 ; CHECK-LABEL: @select_i32_from_icmp
 ; CHECK-VF4IC1: vector.ph:
-; CHECK-VF4IC1-NOT: shufflevector <4 x i32>
-; CHECK-VF4IC1-NOT: shufflevector <4 x i32>
+; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
+; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0
+; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]]
 ; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a
+; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
+; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
 entry:
 br label %for.body
@@ -156,15 +154,14 @@ exit: ; preds = %for.body
 define i32 @select_const_i32_from_fcmp_fast(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_fcmp_fast
 ; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float>
 ; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp fast ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
 entry:
 br label %for.body
@@ -187,15 +184,14 @@ exit: ; preds = %for.body
 define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_fcmp
 ; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float>
 ; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
 entry:
 br label %for.body
@@ -220,16 +216,18 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC1: vector.ph:
 ; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
 ; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC1-NOT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0 +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0 +; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-VF4IC1: vector.body: -; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] -; CHECK-VF4IC1: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[SPLAT_OF_A]], -; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_PHI]], +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]] ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]]) -; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a +; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 +; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]] +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll index 55e61158a79c61..16ab45415b5cc5 100644 --- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll @@ -8,25 +8,26 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ undef, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[NOT:%*]] = xor <2 x i1> [[TMP3]], -; CHECK-NEXT: [[TMP4]] = or <2 x i1> [[VEC_PHI]], 
[[NOT]] +; CHECK-NEXT: [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]]) -; CHECK-NEXT: [[FR_TMP6:%.*]] = freeze i1 [[TMP6]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 undef +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], undef +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 undef ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] @@ -71,25 +72,26 @@ define i64 @pr62565_incoming_value_known_poison(i64 %a, ptr %src) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ poison, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[NOT:%.*]] = xor <2 x i1> [[TMP3]], -; CHECK-NEXT: [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]]) -; CHECK-NEXT: [[FR_TMP6:%.*]] = freeze i1 [[TMP6]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 poison +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], poison +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 poison ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] @@ -134,25 +136,30 @@ define i64 @pr62565_incoming_value_may_be_poison(i64 %a, ptr %src, i64 %start) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: 
vector.ph: +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[START]], i64 0 +; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <2 x i64> [[MINMAX_IDENT_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[NOT:%.*]] = xor <2 x i1> [[TMP3]], -; CHECK-NEXT: [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]]) -; CHECK-NEXT: [[FR_TMP6:%.*]] = freeze i1 [[TMP6]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 [[START]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[START]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 [[START]] ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] From 266b2a26408c42ed1ac84ef38dfc41695423d9da Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Wed, 10 Apr 2024 15:28:36 -0500 Subject: [PATCH 176/300] [ValueTracking] Add tests for `computeKnownFPClass` of `llvm.vector.reduce.{fmin,fmax,fmaximum,fminimum}`; NFC --- .../known-fpclass-reduce-signbit.ll | 134 ++++++++++++++++++ .../InstSimplify/known-never-infinity.ll | 112 +++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/known-fpclass-reduce-signbit.ll diff --git a/llvm/test/Transforms/InstCombine/known-fpclass-reduce-signbit.ll b/llvm/test/Transforms/InstCombine/known-fpclass-reduce-signbit.ll new file mode 100644 index 00000000000000..2f3db14b2e1ac5 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/known-fpclass-reduce-signbit.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; 
RUN: opt < %s -S -passes=instcombine | FileCheck %s + +define i1 @vector_reduce_maximum_signbit(<4 x double> nofpclass(nan nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_maximum_signbit +; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + +define i1 @vector_reduce_maximum_signbit_fail_maybe_nan(<4 x double> nofpclass(nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_maximum_signbit_fail_maybe_nan +; CHECK-SAME: (<4 x double> nofpclass(nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + + +define i1 @vector_reduce_minimum_signbit(<4 x double> nofpclass(nan nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_minimum_signbit +; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + +define i1 @vector_reduce_minimum_signbit_fail_maybe_nan(<4 x double> nofpclass(nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_minimum_signbit_fail_maybe_nan +; CHECK-SAME: (<4 x double> nofpclass(nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + +define i1 @vector_reduce_max_signbit(<4 x double> nofpclass(nan nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_max_signbit +; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + +define i1 @vector_reduce_max_signbit_fail_maybe_nan(<4 x double> 
nofpclass(nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_max_signbit_fail_maybe_nan +; CHECK-SAME: (<4 x double> nofpclass(nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + + +define i1 @vector_reduce_min_signbit(<4 x double> nofpclass(nan nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_min_signbit +; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + +define i1 @vector_reduce_min_signbit_fail_maybe_nan(<4 x double> nofpclass(nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_min_signbit_fail_maybe_nan +; CHECK-SAME: (<4 x double> nofpclass(nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + + + +define i1 @vector_reduce_min_signbit_nnan_from_fmf(<4 x double> nofpclass(nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_min_signbit_nnan_from_fmf +; CHECK-SAME: (<4 x double> nofpclass(nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call nnan double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call nnan double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + + diff --git a/llvm/test/Transforms/InstSimplify/known-never-infinity.ll b/llvm/test/Transforms/InstSimplify/known-never-infinity.ll index 74039d3ffd56ca..470b56a8ef2ae3 100644 --- a/llvm/test/Transforms/InstSimplify/known-never-infinity.ll +++ b/llvm/test/Transforms/InstSimplify/known-never-infinity.ll @@ -1109,6 +1109,118 @@ define float @fcmp_ult_neginf_implies_class_assert(float %arg) { ret float %mul_by_zero } +define i1 @isKnownNeverInfinity_vector_reduce_maximum(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_maximum +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd ninf <4 x double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd 
ninf <4 x double> %x, + %op = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_maximum_fail(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_maximum_fail +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd <4 x double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd <4 x double> %x, + %op = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_minimum(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_minimum +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd ninf <4 x double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd ninf <4 x double> %x, + %op = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_minimum_fail(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_minimum_fail +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd <4 x double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd <4 x double> %x, + %op = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_fmax(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_fmax +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd ninf <4 x double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd ninf <4 x double> %x, + %op = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_fmax_fail(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_fmax_fail +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd <4 x double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd <4 x double> %x, + %op = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_fmin(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_fmin +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd ninf <4 x 
double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd ninf <4 x double> %x, + %op = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_fmin_fail(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_fmin_fail +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd <4 x double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd <4 x double> %x, + %op = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + declare double @llvm.arithmetic.fence.f64(double) declare double @llvm.canonicalize.f64(double) declare double @llvm.ceil.f64(double) From 9eeae4421198b99eab3ae9a4ff678fda26bbda2a Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Wed, 10 Apr 2024 14:35:09 -0500 Subject: [PATCH 177/300] [ValueTracking] Implement `computeKnownFPClass` for `llvm.vector.reduce.{fmin,fmax,fmaximum,fminimum}` Closes #88408 --- llvm/lib/Analysis/ValueTracking.cpp | 13 ++++++++++ .../known-fpclass-reduce-signbit.ll | 25 ++++--------------- .../InstSimplify/known-never-infinity.ll | 20 +++------------ 3 files changed, 22 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index e91dc07f31641b..ab2f43e1033fa1 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -5032,6 +5032,19 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts, break; } + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: + case Intrinsic::vector_reduce_fmaximum: + case Intrinsic::vector_reduce_fminimum: { + // reduce min/max will choose an element from one of the vector elements, + // so we can infer any class information that is common to all elements. + Known = computeKnownFPClass(II->getArgOperand(0), II->getFastMathFlags(), + InterestedClasses, Depth + 1, Q); + // Can only propagate sign if output is never NaN.
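+    // (For example, fabs of a nofpclass(nan) vector makes every lane non-NaN
+    // and non-negative, so the reduced result keeps a known-zero sign bit; a
+    // possibly-NaN lane would leave the sign of the result unknown.)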
+ if (!Known.isKnownNeverNaN()) + Known.SignBit.reset(); + break; + } case Intrinsic::trunc: case Intrinsic::floor: case Intrinsic::ceil: diff --git a/llvm/test/Transforms/InstCombine/known-fpclass-reduce-signbit.ll b/llvm/test/Transforms/InstCombine/known-fpclass-reduce-signbit.ll index 2f3db14b2e1ac5..f46ea9db751ff4 100644 --- a/llvm/test/Transforms/InstCombine/known-fpclass-reduce-signbit.ll +++ b/llvm/test/Transforms/InstCombine/known-fpclass-reduce-signbit.ll @@ -4,10 +4,7 @@ define i1 @vector_reduce_maximum_signbit(<4 x double> nofpclass(nan nzero) %x) { ; CHECK-LABEL: define i1 @vector_reduce_maximum_signbit ; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { -; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) -; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> [[X_ABS]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) %op = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %x.abs) @@ -33,10 +30,7 @@ define i1 @vector_reduce_maximum_signbit_fail_maybe_nan(<4 x double> nofpclass(n define i1 @vector_reduce_minimum_signbit(<4 x double> nofpclass(nan nzero) %x) { ; CHECK-LABEL: define i1 @vector_reduce_minimum_signbit ; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { -; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) -; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> [[X_ABS]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) %op = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %x.abs) @@ -61,10 +55,7 @@ define i1 @vector_reduce_minimum_signbit_fail_maybe_nan(<4 x double> nofpclass(n define i1 @vector_reduce_max_signbit(<4 x double> nofpclass(nan nzero) %x) { ; CHECK-LABEL: define i1 @vector_reduce_max_signbit ; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { -; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) -; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[X_ABS]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) %op = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x.abs) @@ -90,10 +81,7 @@ define i1 @vector_reduce_max_signbit_fail_maybe_nan(<4 x double> nofpclass(nzero define i1 @vector_reduce_min_signbit(<4 x double> nofpclass(nan nzero) %x) { ; CHECK-LABEL: define i1 @vector_reduce_min_signbit ; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { -; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) -; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[X_ABS]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) %op = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x.abs) @@ -120,10 +108,7 @@ define i1 @vector_reduce_min_signbit_fail_maybe_nan(<4 x double> nofpclass(nzero define i1 @vector_reduce_min_signbit_nnan_from_fmf(<4 x double> nofpclass(nzero) %x) { ; CHECK-LABEL: define i1 
@vector_reduce_min_signbit_nnan_from_fmf ; CHECK-SAME: (<4 x double> nofpclass(nzero) [[X:%.*]]) { -; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) -; CHECK-NEXT: [[OP:%.*]] = call nnan double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[X_ABS]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) %op = call nnan double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x.abs) diff --git a/llvm/test/Transforms/InstSimplify/known-never-infinity.ll b/llvm/test/Transforms/InstSimplify/known-never-infinity.ll index 470b56a8ef2ae3..4d662c08b1a7a1 100644 --- a/llvm/test/Transforms/InstSimplify/known-never-infinity.ll +++ b/llvm/test/Transforms/InstSimplify/known-never-infinity.ll @@ -1112,10 +1112,7 @@ define float @fcmp_ult_neginf_implies_class_assert(float %arg) { define i1 @isKnownNeverInfinity_vector_reduce_maximum(<4 x double> %x) { ; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_maximum ; CHECK-SAME: (<4 x double> [[X:%.*]]) { -; CHECK-NEXT: [[NINF_X:%.*]] = fadd ninf <4 x double> [[X]], -; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> [[NINF_X]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %ninf.x = fadd ninf <4 x double> %x, %op = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %ninf.x) @@ -1140,10 +1137,7 @@ define i1 @isKnownNeverInfinity_vector_reduce_maximum_fail(<4 x double> %x) { define i1 @isKnownNeverInfinity_vector_reduce_minimum(<4 x double> %x) { ; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_minimum ; CHECK-SAME: (<4 x double> [[X:%.*]]) { -; CHECK-NEXT: [[NINF_X:%.*]] = fadd ninf <4 x double> [[X]], -; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> [[NINF_X]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %ninf.x = fadd ninf <4 x double> %x, %op = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %ninf.x) @@ -1168,10 +1162,7 @@ define i1 @isKnownNeverInfinity_vector_reduce_minimum_fail(<4 x double> %x) { define i1 @isKnownNeverInfinity_vector_reduce_fmax(<4 x double> %x) { ; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_fmax ; CHECK-SAME: (<4 x double> [[X:%.*]]) { -; CHECK-NEXT: [[NINF_X:%.*]] = fadd ninf <4 x double> [[X]], -; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[NINF_X]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %ninf.x = fadd ninf <4 x double> %x, %op = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %ninf.x) @@ -1196,10 +1187,7 @@ define i1 @isKnownNeverInfinity_vector_reduce_fmax_fail(<4 x double> %x) { define i1 @isKnownNeverInfinity_vector_reduce_fmin(<4 x double> %x) { ; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_fmin ; CHECK-SAME: (<4 x double> [[X:%.*]]) { -; CHECK-NEXT: [[NINF_X:%.*]] = fadd ninf <4 x double> [[X]], -; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[NINF_X]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: ret i1 true ; %ninf.x = fadd ninf <4 x double> %x, %op = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> 
%ninf.x) From d19bd05c79ad3b1a2c3cb439c3fc60825f66bed7 Mon Sep 17 00:00:00 2001 From: Hubert Tong Date: Tue, 16 Apr 2024 17:26:55 -0400 Subject: [PATCH 178/300] Clang Release Notes: Fix reST formatting Fix a use of inline code markup to have a non-word character after the ending delimiter as required by reST. --- clang/docs/ReleaseNotes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e6c345a2f5c0f5..4aedfafcb26aea 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -68,7 +68,7 @@ AST Dumping Potentially Breaking Changes Clang Frontend Potentially Breaking Changes ------------------------------------------- -- Removed support for constructing on-stack ``TemplateArgumentList``s; interfaces should instead +- Removed support for constructing on-stack ``TemplateArgumentList``\ s; interfaces should instead use ``ArrayRef`` to pass template arguments. Transitioning internal uses to ``ArrayRef`` reduces AST memory usage by 0.4% when compiling clang, and is expected to show similar improvements on other workloads. From 3074060d6a1d7d2e74cb767876bd9e5192d12007 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 16 Apr 2024 14:28:45 -0700 Subject: [PATCH 179/300] [memprof] Use SizeIs (NFC) (#88984) --- llvm/unittests/ProfileData/MemProfTest.cpp | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index f596919ed039a8..7e00a80cacf933 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -183,13 +183,13 @@ TEST(MemProf, FillsValue) { // We expect 4 records. We attach alloc site data to foo and bar, i.e. // all frames bottom up until we find a non-inline frame. We attach call site // data to bar, xyz and abc. - ASSERT_EQ(Records.size(), 4U); + ASSERT_THAT(Records, SizeIs(4)); // Check the memprof record for foo. 
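  // (getGUID maps a function name to the GlobalValue GUID that keys the
  // profile record map, so the record for "foo" can be looked up directly.)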
const llvm::GlobalValue::GUID FooId = IndexedMemProfRecord::getGUID("foo"); ASSERT_EQ(Records.count(FooId), 1U); const MemProfRecord &Foo = Records[FooId]; - ASSERT_EQ(Foo.AllocSites.size(), 1U); + ASSERT_THAT(Foo.AllocSites, SizeIs(1)); EXPECT_EQ(Foo.AllocSites[0].Info.getAllocCount(), 1U); EXPECT_THAT(Foo.AllocSites[0].CallStack[0], FrameContains("foo", 5U, 30U, true)); @@ -205,7 +205,7 @@ TEST(MemProf, FillsValue) { const llvm::GlobalValue::GUID BarId = IndexedMemProfRecord::getGUID("bar"); ASSERT_EQ(Records.count(BarId), 1U); const MemProfRecord &Bar = Records[BarId]; - ASSERT_EQ(Bar.AllocSites.size(), 1U); + ASSERT_THAT(Bar.AllocSites, SizeIs(1)); EXPECT_EQ(Bar.AllocSites[0].Info.getAllocCount(), 1U); EXPECT_THAT(Bar.AllocSites[0].CallStack[0], FrameContains("foo", 5U, 30U, true)); @@ -216,8 +216,8 @@ TEST(MemProf, FillsValue) { EXPECT_THAT(Bar.AllocSites[0].CallStack[3], FrameContains("abc", 5U, 30U, false)); - ASSERT_EQ(Bar.CallSites.size(), 1U); - ASSERT_EQ(Bar.CallSites[0].size(), 2U); + ASSERT_THAT(Bar.CallSites, SizeIs(1)); + ASSERT_THAT(Bar.CallSites[0], SizeIs(2)); EXPECT_THAT(Bar.CallSites[0][0], FrameContains("foo", 5U, 30U, true)); EXPECT_THAT(Bar.CallSites[0][1], FrameContains("bar", 51U, 20U, false)); @@ -225,8 +225,8 @@ TEST(MemProf, FillsValue) { const llvm::GlobalValue::GUID XyzId = IndexedMemProfRecord::getGUID("xyz"); ASSERT_EQ(Records.count(XyzId), 1U); const MemProfRecord &Xyz = Records[XyzId]; - ASSERT_EQ(Xyz.CallSites.size(), 1U); - ASSERT_EQ(Xyz.CallSites[0].size(), 2U); + ASSERT_THAT(Xyz.CallSites, SizeIs(1)); + ASSERT_THAT(Xyz.CallSites[0], SizeIs(2)); // Expect the entire frame even though in practice we only need the first // entry here. EXPECT_THAT(Xyz.CallSites[0][0], FrameContains("xyz", 5U, 30U, true)); @@ -237,8 +237,8 @@ TEST(MemProf, FillsValue) { ASSERT_EQ(Records.count(AbcId), 1U); const MemProfRecord &Abc = Records[AbcId]; EXPECT_TRUE(Abc.AllocSites.empty()); - ASSERT_EQ(Abc.CallSites.size(), 1U); - ASSERT_EQ(Abc.CallSites[0].size(), 2U); + ASSERT_THAT(Abc.CallSites, SizeIs(1)); + ASSERT_THAT(Abc.CallSites[0], SizeIs(2)); EXPECT_THAT(Abc.CallSites[0][0], FrameContains("xyz", 5U, 30U, true)); EXPECT_THAT(Abc.CallSites[0][1], FrameContains("abc", 5U, 30U, false)); } @@ -393,9 +393,9 @@ TEST(MemProf, SymbolizationFilter) { Records.push_back(KeyRecordPair.second); } - ASSERT_EQ(Records.size(), 1U); - ASSERT_EQ(Records[0].AllocSites.size(), 1U); - ASSERT_EQ(Records[0].AllocSites[0].CallStack.size(), 1U); + ASSERT_THAT(Records, SizeIs(1)); + ASSERT_THAT(Records[0].AllocSites, SizeIs(1)); + ASSERT_THAT(Records[0].AllocSites[0].CallStack, SizeIs(1)); EXPECT_THAT(Records[0].AllocSites[0].CallStack[0], FrameContains("foo", 5U, 30U, false)); } @@ -427,9 +427,9 @@ TEST(MemProf, BaseMemProfReader) { Records.push_back(KeyRecordPair.second); } - ASSERT_EQ(Records.size(), 1U); - ASSERT_EQ(Records[0].AllocSites.size(), 1U); - ASSERT_EQ(Records[0].AllocSites[0].CallStack.size(), 2U); + ASSERT_THAT(Records, SizeIs(1)); + ASSERT_THAT(Records[0].AllocSites, SizeIs(1)); + ASSERT_THAT(Records[0].AllocSites[0].CallStack, SizeIs(2)); EXPECT_THAT(Records[0].AllocSites[0].CallStack[0], FrameContains("foo", 20U, 5U, true)); EXPECT_THAT(Records[0].AllocSites[0].CallStack[1], From b1385dbd98e877a374ce303fd9d1774faf98e31b Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Tue, 16 Apr 2024 14:39:43 -0700 Subject: [PATCH 180/300] [libc][NFC] fix typo in fenv type proxy headers (#88982) libc.incude.fenv -> libc.include.fenv --- libc/hdr/types/CMakeLists.txt | 4 ++-- 1 file changed, 
2 insertions(+), 2 deletions(-) diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index ecb952b60cc061..f53766777e7530 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -28,7 +28,7 @@ add_proxy_header_library( fenv_t.h FULL_BUILD_DEPENDS libc.include.llvm-libc-types.fenv_t - libc.incude.fenv + libc.include.fenv ) add_proxy_header_library( @@ -37,5 +37,5 @@ add_proxy_header_library( fexcept_t.h FULL_BUILD_DEPENDS libc.include.llvm-libc-types.fexcept_t - libc.incude.fenv + libc.include.fenv ) From 8aa061ffc75adfab4b3084c918e7d4a3ccd5ba43 Mon Sep 17 00:00:00 2001 From: Peiming Liu Date: Tue, 16 Apr 2024 15:18:47 -0700 Subject: [PATCH 181/300] [mlir][sparse][NFC] switching to using `let argments/results` in td files (#88994) followed the same style used in "TensorOps.td". --- .../SparseTensor/IR/SparseTensorOps.td | 324 ++++++++++-------- 1 file changed, 174 insertions(+), 150 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td index d7121e8320a4bc..4e4441c640ed95 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td @@ -27,9 +27,7 @@ class SparseTensor_Op traits = []> // Sparse Tensor Operations. //===----------------------------------------------------------------------===// -def SparseTensor_NewOp : SparseTensor_Op<"new", [Pure]>, - Arguments<(ins AnyType:$source)>, - Results<(outs AnySparseTensor:$result)> { +def SparseTensor_NewOp : SparseTensor_Op<"new", [Pure]> { string summary = "Materializes a new sparse tensor from given source"; string description = [{ Materializes a sparse tensor with contents taken from an opaque pointer @@ -51,15 +49,14 @@ def SparseTensor_NewOp : SparseTensor_Op<"new", [Pure]>, sparse_tensor.new %source : !Source to tensor<1024x1024xf64, #CSR> ``` }]; + + let arguments = (ins AnyType:$source); + let results = (outs AnySparseTensor:$result); let assemblyFormat = "$source attr-dict `:` type($source) `to` type($result)"; } -def SparseTensor_AssembleOp : SparseTensor_Op<"assemble", [Pure]>, - Arguments<(ins Variadic>:$levels, - TensorOf<[AnyType]>:$values)>, - Results<(outs AnySparseTensor: $result)> { +def SparseTensor_AssembleOp : SparseTensor_Op<"assemble", [Pure]> { let summary = "Returns a sparse tensor assembled from the given levels and values"; - let description = [{ Assembles the per-level position and coordinate arrays together with the values arrays into a sparse tensor. 
The order and types of the @@ -93,6 +90,9 @@ def SparseTensor_AssembleOp : SparseTensor_Op<"assemble", [Pure]>, ``` }]; + let arguments = (ins Variadic>:$levels, + TensorOf<[AnyType]>:$values); + let results = (outs AnySparseTensor: $result); let assemblyFormat = "` ` `(` $levels `)` `,` $values attr-dict `:`" " `(` type($levels) `)` `,` type($values) `to` type($result)"; @@ -100,16 +100,8 @@ def SparseTensor_AssembleOp : SparseTensor_Op<"assemble", [Pure]>, let hasVerifier = 1; } -def SparseTensor_DisassembleOp : SparseTensor_Op<"disassemble", [Pure, SameVariadicResultSize]>, - Arguments<(ins AnySparseTensor:$tensor, - Variadic>:$out_levels, - TensorOf<[AnyType]>:$out_values)>, - Results<(outs Variadic>:$ret_levels, - TensorOf<[AnyType]>:$ret_values, - Variadic:$lvl_lens, - AnyIndexingScalarLike:$val_len)> { +def SparseTensor_DisassembleOp : SparseTensor_Op<"disassemble", [Pure, SameVariadicResultSize]> { let summary = "Copies the levels and values of the given sparse tensor"; - let description = [{ The disassemble operation is the inverse of `sparse_tensor::assemble`. It copies the per-level position and coordinate arrays together with @@ -143,6 +135,13 @@ def SparseTensor_DisassembleOp : SparseTensor_Op<"disassemble", [Pure, SameVaria ``` }]; + let arguments = (ins AnySparseTensor:$tensor, + Variadic>:$out_levels, + TensorOf<[AnyType]>:$out_values); + let results = (outs Variadic>:$ret_levels, + TensorOf<[AnyType]>:$ret_values, + Variadic:$lvl_lens, + AnyIndexingScalarLike:$val_len); let assemblyFormat = "$tensor attr-dict `:` type($tensor)" "`out_lvls` `(` $out_levels `:` type($out_levels) `)` " @@ -154,9 +153,7 @@ def SparseTensor_DisassembleOp : SparseTensor_Op<"disassemble", [Pure, SameVaria } def SparseTensor_ConvertOp : SparseTensor_Op<"convert", - [Pure, StageWithSortSparseOpInterface]>, - Arguments<(ins AnyTensor:$source)>, - Results<(outs AnyTensor:$dest)> { + [Pure, StageWithSortSparseOpInterface]> { string summary = "Converts between different tensor types"; string description = [{ Converts one sparse or dense tensor type to another tensor type. The rank @@ -197,20 +194,22 @@ def SparseTensor_ConvertOp : SparseTensor_Op<"convert", }]; + let arguments = (ins AnyTensor:$source); + let results = (outs AnyTensor:$dest); + let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)"; + let extraClassDeclaration = [{ // Whether the convert can be done by a single step or it would require // an extra sort. Inherited from StageWithSortSparseOpInterface. 
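      // (For example, a conversion that changes the level ordering, such as a
      // row-major to column-major rewrite, is staged through an extra sort,
      // while an order-preserving conversion can be done in a single step.)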
bool needsExtraSort(); }]; - let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)"; let hasFolder = 1; let hasVerifier = 1; } -def SparseTensor_ReinterpretMapOp : SparseTensor_Op<"reinterpret_map", [NoMemoryEffect]>, - Arguments<(ins AnySparseTensor:$source)>, - Results<(outs AnySparseTensor:$dest)> { +def SparseTensor_ReinterpretMapOp : SparseTensor_Op<"reinterpret_map", + [NoMemoryEffect]> { let summary = "Reinterprets the dimension/level maps of the source tensor"; let description = [{ Reinterprets the dimension-to-level and level-to-dimension map specified in @@ -248,19 +247,20 @@ def SparseTensor_ReinterpretMapOp : SparseTensor_Op<"reinterpret_map", [NoMemory ``` }]; + let arguments = (ins AnySparseTensor:$source); + let results = (outs AnySparseTensor:$dest); + let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)"; + let builders = [ OpBuilder<(ins "SparseTensorEncodingAttr":$dstEnc, "Value":$source)> ]; - let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)"; let hasFolder = 1; let hasVerifier = 1; } def SparseTensor_ToPositionsOp : SparseTensor_Op<"positions", - [Pure, DeclareOpInterfaceMethods]>, - Arguments<(ins AnySparseTensor:$tensor, LevelAttr:$level)>, - Results<(outs AnyNon0RankedMemRef:$result)> { + [Pure, DeclareOpInterfaceMethods]> { let summary = "Extracts the `level`-th positions array of the `tensor`"; let description = [{ Returns the positions array of the tensor's storage at the given @@ -280,14 +280,16 @@ def SparseTensor_ToPositionsOp : SparseTensor_Op<"positions", : tensor<64x64xf64, #CSR> to memref ``` }]; + + let arguments = (ins AnySparseTensor:$tensor, LevelAttr:$level); + let results = (outs AnyNon0RankedMemRef:$result); let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($result)"; + let hasVerifier = 1; } def SparseTensor_ToCoordinatesOp : SparseTensor_Op<"coordinates", - [Pure, DeclareOpInterfaceMethods]>, - Arguments<(ins AnySparseTensor:$tensor, LevelAttr:$level)>, - Results<(outs AnyNon0RankedMemRef:$result)> { + [Pure, DeclareOpInterfaceMethods]> { let summary = "Extracts the `level`-th coordinates array of the `tensor`"; let description = [{ Returns the coordinates array of the tensor's storage at the given @@ -307,14 +309,16 @@ def SparseTensor_ToCoordinatesOp : SparseTensor_Op<"coordinates", : tensor<64x64xf64, #CSR> to memref ``` }]; + + let arguments = (ins AnySparseTensor:$tensor, LevelAttr:$level); + let results = (outs AnyNon0RankedMemRef:$result); let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($result)"; + let hasVerifier = 1; } def SparseTensor_ToCoordinatesBufferOp : SparseTensor_Op<"coordinates_buffer", - [Pure, DeclareOpInterfaceMethods]>, - Arguments<(ins AnySparseTensor:$tensor)>, - Results<(outs AnyNon0RankedMemRef:$result)> { + [Pure, DeclareOpInterfaceMethods]> { let summary = "Extracts the linear coordinates array from a tensor"; let description = [{ Returns the linear coordinates array for a sparse tensor with @@ -339,14 +343,16 @@ def SparseTensor_ToCoordinatesBufferOp : SparseTensor_Op<"coordinates_buffer", : tensor<64x64xf64, #COO> to memref ``` }]; + + let arguments = (ins AnySparseTensor:$tensor); + let results = (outs AnyNon0RankedMemRef:$result); let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($result)"; + let hasVerifier = 1; } def SparseTensor_ToValuesOp : SparseTensor_Op<"values", - [Pure, DeclareOpInterfaceMethods]>, - Arguments<(ins AnySparseTensor:$tensor)>, - Results<(outs 
AnyNon0RankedMemRef:$result)> { + [Pure, DeclareOpInterfaceMethods]> { let summary = "Extracts numerical values array from a tensor"; let description = [{ Returns the values array of the sparse storage format for the given @@ -365,13 +371,15 @@ def SparseTensor_ToValuesOp : SparseTensor_Op<"values", %1 = sparse_tensor.values %0 : tensor<64x64xf64, #CSR> to memref ``` }]; + + let arguments = (ins AnySparseTensor:$tensor); + let results = (outs AnyNon0RankedMemRef:$result); let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($result)"; + let hasVerifier = 1; } -def SparseTensor_NumberOfEntriesOp : SparseTensor_Op<"number_of_entries", [Pure]>, - Arguments<(ins AnySparseTensor:$tensor)>, - Results<(outs Index:$result)> { +def SparseTensor_NumberOfEntriesOp : SparseTensor_Op<"number_of_entries", [Pure]> { let summary = "Returns the number of entries that are stored in the tensor."; let description = [{ Returns the number of entries that are stored in the given sparse tensor. @@ -385,14 +393,14 @@ def SparseTensor_NumberOfEntriesOp : SparseTensor_Op<"number_of_entries", [Pure] %noe = sparse_tensor.number_of_entries %tensor : tensor<64x64xf64, #CSR> ``` }]; + + let arguments = (ins AnySparseTensor:$tensor); + let results = (outs Index:$result); let assemblyFormat = "$tensor attr-dict `:` type($tensor)"; } def SparseTensor_ConcatenateOp : SparseTensor_Op<"concatenate", - [Pure, StageWithSortSparseOpInterface]>, - Arguments<(ins Variadic:$inputs, DimensionAttr:$dimension)>, - Results<(outs AnyRankedTensor:$result)> { - + [Pure, StageWithSortSparseOpInterface]> { let summary = "Concatenates a list of tensors into a single tensor."; let description = [{ Concatenates a list input tensors and the output tensor with the same @@ -418,13 +426,14 @@ def SparseTensor_ConcatenateOp : SparseTensor_Op<"concatenate", bool needsExtraSort(); }]; + let arguments = (ins Variadic:$inputs, DimensionAttr:$dimension); + let results = (outs AnyRankedTensor:$result); let assemblyFormat = "$inputs attr-dict `:` type($inputs) `to` type($result)"; + let hasVerifier = 1; } -def SparseTensor_ToSliceOffsetOp : SparseTensor_Op<"slice.offset", [Pure]>, - Arguments<(ins AnySparseTensorSlice:$slice, IndexAttr:$dim)>, - Results<(outs Index:$offset)> { +def SparseTensor_ToSliceOffsetOp : SparseTensor_Op<"slice.offset", [Pure]> { let summary = "Extracts the offset of the sparse tensor slice at the given dimension"; let description = [{ Extracts the offset of the sparse tensor slice at the given dimension. @@ -445,13 +454,15 @@ def SparseTensor_ToSliceOffsetOp : SparseTensor_Op<"slice.offset", [Pure]>, // %2 = %v2 ``` }]; + + let arguments = (ins AnySparseTensorSlice:$slice, IndexAttr:$dim); + let results = (outs Index:$offset); let assemblyFormat = "$slice `at` $dim attr-dict `:` type($slice)"; + let hasVerifier = 1; } -def SparseTensor_ToSliceStrideOp : SparseTensor_Op<"slice.stride", [Pure]>, - Arguments<(ins AnySparseTensorSlice:$slice, IndexAttr:$dim)>, - Results<(outs Index:$stride)> { +def SparseTensor_ToSliceStrideOp : SparseTensor_Op<"slice.stride", [Pure]> { let summary = "Extracts the stride of the sparse tensor slice at the given dimension"; let description = [{ Extracts the stride of the sparse tensor slice at the given dimension. 
@@ -473,7 +484,11 @@ def SparseTensor_ToSliceStrideOp : SparseTensor_Op<"slice.stride", [Pure]>, ``` }]; + + let arguments = (ins AnySparseTensorSlice:$slice, IndexAttr:$dim); + let results = (outs Index:$stride); let assemblyFormat = "$slice `at` $dim attr-dict `:` type($slice)"; + let hasVerifier = 1; } @@ -482,9 +497,7 @@ def SparseTensor_ToSliceStrideOp : SparseTensor_Op<"slice.stride", [Pure]>, //===----------------------------------------------------------------------===// def SparseTensor_StorageSpecifierInitOp : SparseTensor_Op<"storage_specifier.init", - [Pure]>, - Arguments<(ins Optional:$source)>, - Results<(outs SparseTensorStorageSpecifier:$result)> { + [Pure]> { let summary = ""; let description = [{ Returns an initial storage specifier value. A storage specifier @@ -515,6 +528,10 @@ def SparseTensor_StorageSpecifierInitOp : SparseTensor_Op<"storage_specifier.ini ``` }]; + let arguments = (ins Optional:$source); + let results = (outs SparseTensorStorageSpecifier:$result); + let assemblyFormat = "attr-dict (`with` $source^)? `:` (`from` qualified(type($source))^ `to`)?" + " qualified(type($result))"; let builders = [ OpBuilder<(ins "Type":$result), [{ @@ -522,15 +539,10 @@ def SparseTensor_StorageSpecifierInitOp : SparseTensor_Op<"storage_specifier.ini }]> ]; - let assemblyFormat = "attr-dict (`with` $source^)? `:` (`from` qualified(type($source))^ `to`)?" - " qualified(type($result))"; + } -def SparseTensor_GetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.get", [Pure]>, - Arguments<(ins SparseTensorStorageSpecifier:$specifier, - SparseTensorStorageSpecifierKindAttr:$specifierKind, - OptionalAttr:$level)>, - Results<(outs Index:$result)> { +def SparseTensor_GetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.get", [Pure]> { let summary = ""; let description = [{ Returns the requested field of the given storage_specifier. @@ -543,19 +555,19 @@ def SparseTensor_GetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.get" ``` }]; + let arguments = (ins SparseTensorStorageSpecifier:$specifier, + SparseTensorStorageSpecifierKindAttr:$specifierKind, + OptionalAttr:$level); + let results = (outs Index:$result); let assemblyFormat = "$specifier $specifierKind (`at` $level^)? attr-dict" "`:` qualified(type($specifier))"; + let hasVerifier = 1; let hasFolder = 1; } def SparseTensor_SetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.set", - [Pure, AllTypesMatch<["result", "specifier"]>]>, - Arguments<(ins SparseTensorStorageSpecifier:$specifier, - SparseTensorStorageSpecifierKindAttr:$specifierKind, - OptionalAttr:$level, - Index:$value)>, - Results<(outs SparseTensorStorageSpecifier:$result)> { + [Pure, AllTypesMatch<["result", "specifier"]>]> { let summary = ""; let description = [{ Set the field of the storage specifier to the given input value. Returns @@ -568,8 +580,15 @@ def SparseTensor_SetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.set" : !sparse_tensor.storage_specifier<#COO> ``` }]; + + let arguments = (ins SparseTensorStorageSpecifier:$specifier, + SparseTensorStorageSpecifierKindAttr:$specifierKind, + OptionalAttr:$level, + Index:$value); + let results = (outs SparseTensorStorageSpecifier:$result); let assemblyFormat = "$specifier $specifierKind (`at` $level^)? `with` $value" " attr-dict `:` qualified(type($result))"; + let hasVerifier = 1; } @@ -577,9 +596,7 @@ def SparseTensor_SetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.set" // Sparse Tensor Coordinate Operations. 
//===----------------------------------------------------------------------===// -def SparseTensor_LvlOp : SparseTensor_Op<"lvl", [ConditionallySpeculatable, NoMemoryEffect]>, - Arguments<(ins AnySparseTensor:$source, Index:$index)>, - Results<(outs Index:$result)> { +def SparseTensor_LvlOp : SparseTensor_Op<"lvl", [ConditionallySpeculatable, NoMemoryEffect]> { let summary = "level index operation"; let description = [{ The `sparse_tensor.lvl` behaves similar to `tensor.dim` operation. @@ -615,9 +632,9 @@ def SparseTensor_LvlOp : SparseTensor_Op<"lvl", [ConditionallySpeculatable, NoMe ``` }]; - let assemblyFormat = [{ - attr-dict $source `,` $index `:` type($source) - }]; + let arguments = (ins AnySparseTensor:$source, Index:$index); + let results = (outs Index:$result); + let assemblyFormat = "attr-dict $source `,` $index `:` type($source) "; let builders = [ OpBuilder<(ins "Value":$source, "int64_t":$index)> @@ -635,11 +652,7 @@ def SparseTensor_LvlOp : SparseTensor_Op<"lvl", [ConditionallySpeculatable, NoMe let hasFolder = 1; } -def SparseTensor_CrdTranslateOp : SparseTensor_Op<"crd_translate", [Pure]>, - Arguments<(ins Variadic:$in_crds, - SparseTensorCrdTransDirectionAttr:$direction, - SparseTensorEncodingAttr:$encoder)>, - Results<(outs Variadic:$out_crds)> { +def SparseTensor_CrdTranslateOp : SparseTensor_Op<"crd_translate", [Pure]> { string summary = "Performs coordinate translation between level and dimension coordinate space."; string description = [{ Performs coordinate translation between level and dimension coordinate space according @@ -652,7 +665,13 @@ def SparseTensor_CrdTranslateOp : SparseTensor_Op<"crd_translate", [Pure]>, : index, index, index, index ``` }]; + + let arguments = (ins Variadic:$in_crds, + SparseTensorCrdTransDirectionAttr:$direction, + SparseTensorEncodingAttr:$encoder); + let results = (outs Variadic:$out_crds); let assemblyFormat = "$direction `[` $in_crds `]` `as` $encoder attr-dict `:` type($out_crds)"; + let hasVerifier = 1; let hasFolder = 1; } @@ -669,13 +688,7 @@ def SparseTensor_PushBackOp : SparseTensor_Op<"push_back", [TypesMatchWith<"value type matches element type of inBuffer", "inBuffer", "value", "::llvm::cast($_self).getElementType()">, - AllTypesMatch<["inBuffer", "outBuffer"]>]>, - Arguments<(ins Index:$curSize, - StridedMemRefRankOf<[AnyType], [1]>:$inBuffer, - AnyType:$value, Optional:$n, - UnitAttr:$inbounds)>, - Results<(outs StridedMemRefRankOf<[AnyType], [1]>:$outBuffer, - Index:$newSize)> { + AllTypesMatch<["inBuffer", "outBuffer"]>]> { string summary = "Pushes a value to the back of a given buffer"; string description = [{ Pushes `value` to the end of the given sparse tensor storage buffer @@ -719,6 +732,13 @@ def SparseTensor_PushBackOp : SparseTensor_Op<"push_back", : xindex, memref, f64 ``` }]; + + let arguments = (ins Index:$curSize, + StridedMemRefRankOf<[AnyType], [1]>:$inBuffer, + AnyType:$value, Optional:$n, + UnitAttr:$inbounds); + let results = (outs StridedMemRefRankOf<[AnyType], [1]>:$outBuffer, + Index:$newSize); let assemblyFormat = "(`inbounds` $inbounds^)? $curSize `,` $inBuffer" " `,` $value (`,` $n^ )? 
attr-dict `:`" " type($curSize) `,` type($inBuffer) `,`" @@ -732,12 +752,7 @@ def SparseTensor_PushBackOp : SparseTensor_Op<"push_back", let hasVerifier = 1; } -def SparseTensor_ExpandOp : SparseTensor_Op<"expand", []>, - Arguments<(ins AnySparseTensor:$tensor)>, - Results<(outs AnyStridedMemRefOfRank<1>:$values, - StridedMemRefRankOf<[I1],[1]>:$filled, - StridedMemRefRankOf<[Index],[1]>:$added, - Index:$count)> { +def SparseTensor_ExpandOp : SparseTensor_Op<"expand", []> { string summary = "Expands an access pattern for insertion"; string description = [{ Performs an access pattern expansion for the innermost levels of the @@ -771,19 +786,19 @@ def SparseTensor_ExpandOp : SparseTensor_Op<"expand", []>, : tensor<4x4xf64, #CSR> to memref, memref, memref ``` }]; + + + let arguments = (ins AnySparseTensor:$tensor); + let results = (outs AnyStridedMemRefOfRank<1>:$values, + StridedMemRefRankOf<[I1],[1]>:$filled, + StridedMemRefRankOf<[Index],[1]>:$added, + Index:$count); let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($values)" " `,` type($filled) `,` type($added)"; } def SparseTensor_CompressOp : SparseTensor_Op<"compress", - [AllTypesMatch<["tensor", "result"]>]>, - Arguments<(ins AnyStridedMemRefOfRank<1>:$values, - StridedMemRefRankOf<[I1],[1]>:$filled, - StridedMemRefRankOf<[Index],[1]>:$added, - Index:$count, - AnySparseTensor:$tensor, - Variadic:$lvlCoords)>, - Results<(outs AnySparseTensor:$result)> { + [AllTypesMatch<["tensor", "result"]>]> { string summary = "Compressed an access pattern for insertion"; string description = [{ Finishes a single access pattern expansion by moving inserted elements @@ -807,6 +822,14 @@ def SparseTensor_CompressOp : SparseTensor_Op<"compress", : memref, memref, memref, tensor<4x4xf64, #CSR> ``` }]; + + let arguments = (ins AnyStridedMemRefOfRank<1>:$values, + StridedMemRefRankOf<[I1],[1]>:$filled, + StridedMemRefRankOf<[Index],[1]>:$added, + Index:$count, + AnySparseTensor:$tensor, + Variadic:$lvlCoords); + let results = (outs AnySparseTensor:$result); let assemblyFormat = "$values `,` $filled `,` $added `,` $count" " `into` $tensor `[` $lvlCoords `]` attr-dict" " `:` type($values) `,` type($filled) `,` type($added)" @@ -814,9 +837,7 @@ def SparseTensor_CompressOp : SparseTensor_Op<"compress", let hasVerifier = 1; } -def SparseTensor_LoadOp : SparseTensor_Op<"load", [SameOperandsAndResultType]>, - Arguments<(ins AnySparseTensor:$tensor, UnitAttr:$hasInserts)>, - Results<(outs AnyTensor:$result)> { +def SparseTensor_LoadOp : SparseTensor_Op<"load", [SameOperandsAndResultType]> { let summary = "Rematerializes tensor from underlying sparse storage format"; let description = [{ @@ -845,11 +866,13 @@ def SparseTensor_LoadOp : SparseTensor_Op<"load", [SameOperandsAndResultType]>, %1 = sparse_tensor.load %0 hasInserts : tensor<16x32xf32, #CSR> ``` }]; + + let arguments = (ins AnySparseTensor:$tensor, UnitAttr:$hasInserts); + let results = (outs AnyTensor:$result); let assemblyFormat = "$tensor (`hasInserts` $hasInserts^)? 
attr-dict `:` type($tensor)"; } -def SparseTensor_OutOp : SparseTensor_Op<"out", []>, - Arguments<(ins AnySparseTensor:$tensor, AnyType:$dest)> { +def SparseTensor_OutOp : SparseTensor_Op<"out", []> { string summary = "Outputs a sparse tensor to the given destination"; string description = [{ Outputs the contents of a sparse tensor to the destination defined by an @@ -868,6 +891,8 @@ def SparseTensor_OutOp : SparseTensor_Op<"out", []>, sparse_tensor.out %t, %dest : tensor<1024x1024xf64, #CSR>, !Dest ``` }]; + + let arguments = (ins AnySparseTensor:$tensor, AnyType:$dest); let assemblyFormat = "$tensor `,` $dest attr-dict `:` type($tensor) `,` type($dest)"; } @@ -875,11 +900,7 @@ def SparseTensor_OutOp : SparseTensor_Op<"out", []>, // Sparse Tensor Sorting/Ordering Operations. //===----------------------------------------------------------------------===// -def SparseTensor_SortOp : SparseTensor_Op<"sort">, - Arguments<(ins Index:$n, StridedMemRefRankOf<[AnyInteger, Index], [1]>:$xy, - Variadic>:$ys, - AffineMapAttr:$perm_map, OptionalAttr:$ny, - SparseTensorSortKindAttr:$algorithm)> { +def SparseTensor_SortOp : SparseTensor_Op<"sort"> { let summary = "Sorts the arrays in xs and ys lexicographically on the " "integral values found in the xs list"; let description = [{ @@ -904,16 +925,18 @@ def SparseTensor_SortOp : SparseTensor_Op<"sort">, ``` }]; + let arguments = (ins Index:$n, + StridedMemRefRankOf<[AnyInteger, Index], [1]>:$xy, + Variadic>:$ys, + AffineMapAttr:$perm_map, OptionalAttr:$ny, + SparseTensorSortKindAttr:$algorithm); let assemblyFormat = "$algorithm $n" "`,`$xy (`jointly` $ys^)? attr-dict" "`:` type($xy) (`jointly` type($ys)^)?"; let hasVerifier = 1; } -def SparseTensor_ReorderCOOOp : SparseTensor_Op<"reorder_coo", [Pure]>, - Arguments<(ins AnySparseTensor: $input_coo, - SparseTensorSortKindAttr:$algorithm)>, - Results<(outs AnySparseTensor: $result_coo)> { +def SparseTensor_ReorderCOOOp : SparseTensor_Op<"reorder_coo", [Pure]> { let summary = "Reorder the input COO such that it has the the same order as " "the output COO"; let description = [{ @@ -933,6 +956,9 @@ def SparseTensor_ReorderCOOOp : SparseTensor_Op<"reorder_coo", [Pure]>, ``` }]; + let arguments = (ins AnySparseTensor: $input_coo, + SparseTensorSortKindAttr:$algorithm); + let results = (outs AnySparseTensor: $result_coo); let assemblyFormat = "$algorithm $input_coo attr-dict" "`:` type($input_coo) `to` type($result_coo)"; @@ -944,9 +970,7 @@ def SparseTensor_ReorderCOOOp : SparseTensor_Op<"reorder_coo", [Pure]>, // Sparse Tensor Syntax Operations. 
//===----------------------------------------------------------------------===// -def SparseTensor_BinaryOp : SparseTensor_Op<"binary", [Pure]>, - Arguments<(ins AnyType:$x, AnyType:$y, UnitAttr:$left_identity, UnitAttr:$right_identity)>, - Results<(outs AnyType:$output)> { +def SparseTensor_BinaryOp : SparseTensor_Op<"binary", [Pure]> { let summary = "Binary set operation utilized within linalg.generic"; let description = [{ Defines a computation within a `linalg.generic` operation that takes two @@ -1054,18 +1078,24 @@ def SparseTensor_BinaryOp : SparseTensor_Op<"binary", [Pure]>, }]; let regions = (region AnyRegion:$overlapRegion, AnyRegion:$leftRegion, AnyRegion:$rightRegion); + let arguments = (ins AnyType:$x, AnyType:$y, UnitAttr:$left_identity, UnitAttr:$right_identity); + let results = (outs AnyType:$output); let assemblyFormat = [{ $x `,` $y `:` attr-dict type($x) `,` type($y) `to` type($output) `\n` `overlap` `=` $overlapRegion `\n` `left` `=` (`identity` $left_identity^):($leftRegion)? `\n` `right` `=` (`identity` $right_identity^):($rightRegion)? }]; + let hasVerifier = 1; } -def SparseTensor_UnaryOp : SparseTensor_Op<"unary", [Pure]>, - Arguments<(ins AnyType:$x)>, - Results<(outs AnyType:$output)> { +def SparseTensor_UnaryOp : SparseTensor_Op<"unary", [Pure]> { + + let arguments = (ins AnyType:$x); + + let results = (outs AnyType:$output); + let summary = "Unary set operation utilized within linalg.generic"; let description = [{ Defines a computation with a `linalg.generic` operation that takes a single @@ -1162,9 +1192,7 @@ def SparseTensor_UnaryOp : SparseTensor_Op<"unary", [Pure]>, let hasVerifier = 1; } -def SparseTensor_ReduceOp : SparseTensor_Op<"reduce", [Pure, SameOperandsAndResultType]>, - Arguments<(ins AnyType:$x, AnyType:$y, AnyType:$identity)>, - Results<(outs AnyType:$output)> { +def SparseTensor_ReduceOp : SparseTensor_Op<"reduce", [Pure, SameOperandsAndResultType]> { let summary = "Custom reduction operation utilized within linalg.generic"; let description = [{ Defines a computation with a `linalg.generic` operation that takes two @@ -1208,16 +1236,14 @@ def SparseTensor_ReduceOp : SparseTensor_Op<"reduce", [Pure, SameOperandsAndResu }]; let regions = (region SizedRegion<1>:$region); + let arguments = (ins AnyType:$x, AnyType:$y, AnyType:$identity); + let results = (outs AnyType:$output); + let assemblyFormat = "$x `,` $y `,` $identity attr-dict `:` type($output) $region"; - let assemblyFormat = [{ - $x `,` $y `,` $identity attr-dict `:` type($output) $region - }]; let hasVerifier = 1; } -def SparseTensor_SelectOp : SparseTensor_Op<"select", [Pure, SameOperandsAndResultType]>, - Arguments<(ins AnyType:$x)>, - Results<(outs AnyType:$output)> { +def SparseTensor_SelectOp : SparseTensor_Op<"select", [Pure, SameOperandsAndResultType]> { let summary = "Select operation utilized within linalg.generic"; let description = [{ Defines an evaluation within a `linalg.generic` operation that takes a single @@ -1269,16 +1295,16 @@ def SparseTensor_SelectOp : SparseTensor_Op<"select", [Pure, SameOperandsAndResu }]; let regions = (region SizedRegion<1>:$region); - let assemblyFormat = [{ - $x attr-dict `:` type($x) $region - }]; + let arguments = (ins AnyType:$x); + let results = (outs AnyType:$output); + let assemblyFormat = "$x attr-dict `:` type($x) $region"; + let hasVerifier = 1; } def SparseTensor_YieldOp : SparseTensor_Op<"yield", [Pure, Terminator, ParentOneOf<["BinaryOp", "UnaryOp", "ReduceOp", "SelectOp", - "ForeachOp"]>]>, - Arguments<(ins Variadic:$results)> { + 
"ForeachOp"]>]> { let summary = "Yield from sparse_tensor set-like operations"; let description = [{ Yields a value from within a `binary`, `unary`, `reduce`, @@ -1319,17 +1345,12 @@ def SparseTensor_YieldOp : SparseTensor_Op<"yield", [Pure, Terminator, } }]; - let assemblyFormat = [{ - $results attr-dict `:` type($results) - }]; + let arguments = (ins Variadic:$results); + let assemblyFormat = "$results attr-dict `:` type($results)"; } def SparseTensor_ForeachOp : SparseTensor_Op<"foreach", - [SingleBlockImplicitTerminator<"YieldOp">]>, - Arguments<(ins AnyTensor:$tensor, - Variadic:$initArgs, - OptionalAttr:$order)>, - Results<(outs Variadic:$results)> { + [SingleBlockImplicitTerminator<"YieldOp">]> { let summary = "Iterates over elements in a tensor"; let description = [{ Iterates over stored elements in a tensor (which are typically, but not always, @@ -1424,6 +1445,10 @@ def SparseTensor_ForeachOp : SparseTensor_Op<"foreach", ]; let regions = (region SizedRegion<1>:$region); + let arguments = (ins AnyTensor:$tensor, + Variadic:$initArgs, + OptionalAttr:$order); + let results = (outs Variadic:$results); let assemblyFormat = "`in` $tensor (`init``(`$initArgs^`)`)? attr-dict" " `:` type($tensor) (`,` type($initArgs)^)?" " (`->` type($results)^)? `do` $region"; @@ -1436,13 +1461,6 @@ def SparseTensor_ForeachOp : SparseTensor_Op<"foreach", def ExtractIterSpaceOp : SparseTensor_Op<"extract_iteration_space", [Pure, DeclareOpInterfaceMethods]> { - - let arguments = (ins AnySparseTensor:$tensor, - Optional:$parentIter, - LevelAttr:$loLvl, LevelAttr:$hiLvl); - - let results = (outs AnySparseIterSpace:$resultSpace); - let summary = "Extracts an iteration space from a sparse tensor between certain levels"; let description = [{ Extracts a `!sparse_tensor.iter_space` from a sparse tensor between @@ -1485,17 +1503,21 @@ def ExtractIterSpaceOp : SparseTensor_Op<"extract_iteration_space", } }]; - let hasVerifier = 1; + let arguments = (ins AnySparseTensor:$tensor, + Optional:$parentIter, + LevelAttr:$loLvl, LevelAttr:$hiLvl); + let results = (outs AnySparseIterSpace:$resultSpace); let assemblyFormat = "$tensor (`at` $parentIter^)? `lvls` `=` custom($loLvl, $hiLvl) " " attr-dict `:` type($tensor) (`,` type($parentIter)^)?"; + + let hasVerifier = 1; } //===----------------------------------------------------------------------===// // Sparse Tensor Debugging and Test-Only Operations. //===----------------------------------------------------------------------===// -def SparseTensor_PrintOp : SparseTensor_Op<"print">, - Arguments<(ins AnySparseTensor:$tensor)> { +def SparseTensor_PrintOp : SparseTensor_Op<"print"> { string summary = "Prints a sparse tensor (for testing and debugging)"; string description = [{ Prints the individual components of a sparse tensors (the positions, @@ -1509,6 +1531,8 @@ def SparseTensor_PrintOp : SparseTensor_Op<"print">, sparse_tensor.print %tensor : tensor<1024x1024xf64, #CSR> ``` }]; + + let arguments = (ins AnySparseTensor:$tensor); let assemblyFormat = "$tensor attr-dict `:` type($tensor)"; } From 9067070d91e9d8cdd8509ffa56a076f08a3d7281 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 16 Apr 2024 15:40:32 -0700 Subject: [PATCH 182/300] [RISCV] Re-separate unaligned scalar and vector memory features in the backend. (#88954) This is largely a revert of commit e81796671890b59c110f8e41adc7ca26f8484d20. As #88029 shows, there exists hardware that only supports unaligned scalar. I'm leaving how this gets exposed to the clang interface to a future patch. 
--- clang/lib/Basic/Targets/RISCV.cpp | 3 ++- clang/lib/Driver/ToolChains/Arch/RISCV.cpp | 16 +++++++++++----- clang/test/Driver/riscv-features.c | 4 ++-- llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp | 4 ++-- llvm/lib/Target/RISCV/RISCVFeatures.td | 13 +++++++++---- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 16 ++++++++-------- llvm/lib/Target/RISCV/RISCVProcessors.td | 6 ++++-- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 4 ++-- llvm/test/CodeGen/RISCV/memcpy-inline.ll | 4 ++-- llvm/test/CodeGen/RISCV/memcpy.ll | 4 ++-- llvm/test/CodeGen/RISCV/memset-inline.ll | 4 ++-- llvm/test/CodeGen/RISCV/pr56110.ll | 2 +- .../CodeGen/RISCV/riscv-func-target-feature.ll | 2 +- .../RISCV/rvv/concat-vectors-constant-stride.ll | 4 ++-- .../rvv/fixed-vectors-strided-load-combine.ll | 2 +- .../CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll | 4 ++-- llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll | 4 ++-- llvm/test/CodeGen/RISCV/rvv/memset-inline.ll | 4 ++-- .../CodeGen/RISCV/rvv/unaligned-loads-stores.ll | 4 ++-- llvm/test/CodeGen/RISCV/unaligned-load-store.ll | 4 ++-- llvm/utils/TableGen/RISCVTargetDefEmitter.cpp | 12 ++++++++++-- 21 files changed, 71 insertions(+), 49 deletions(-) diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index f3d705e1551fe2..a7ce9dda34bdde 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -353,7 +353,8 @@ bool RISCVTargetInfo::handleTargetFeatures(std::vector &Features, if (ISAInfo->hasExtension("zfh") || ISAInfo->hasExtension("zhinx")) HasLegalHalfType = true; - FastUnalignedAccess = llvm::is_contained(Features, "+fast-unaligned-access"); + FastUnalignedAccess = llvm::is_contained(Features, "+unaligned-scalar-mem") && + llvm::is_contained(Features, "+unaligned-vector-mem"); if (llvm::is_contained(Features, "+experimental")) HasExperimental = true; diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp index b1dd7c4372d475..96b3cc3bb8ffb1 100644 --- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp +++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp @@ -68,8 +68,10 @@ static void getRISCFeaturesFromMcpu(const Driver &D, const Arg *A, << A->getSpelling() << Mcpu; } - if (llvm::RISCV::hasFastUnalignedAccess(Mcpu)) - Features.push_back("+fast-unaligned-access"); + if (llvm::RISCV::hasFastUnalignedAccess(Mcpu)) { + Features.push_back("+unaligned-scalar-mem"); + Features.push_back("+unaligned-vector-mem"); + } } void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple, @@ -168,12 +170,16 @@ void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple, } // Android requires fast unaligned access on RISCV64. - if (Triple.isAndroid()) - Features.push_back("+fast-unaligned-access"); + if (Triple.isAndroid()) { + Features.push_back("+unaligned-scalar-mem"); + Features.push_back("+unaligned-vector-mem"); + } // -mstrict-align is default, unless -mno-strict-align is specified. AddTargetFeature(Args, Features, options::OPT_mno_strict_align, - options::OPT_mstrict_align, "fast-unaligned-access"); + options::OPT_mstrict_align, "unaligned-scalar-mem"); + AddTargetFeature(Args, Features, options::OPT_mno_strict_align, + options::OPT_mstrict_align, "unaligned-vector-mem"); // Now add any that the user explicitly requested on the command line, // which may override the defaults. 
diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c index ce4947d2bc47b4..5e1db5ba1ed3e9 100644 --- a/clang/test/Driver/riscv-features.c +++ b/clang/test/Driver/riscv-features.c @@ -38,8 +38,8 @@ // RUN: %clang --target=riscv32-unknown-elf -### %s -mno-strict-align 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS // RUN: %clang --target=riscv32-unknown-elf -### %s -mstrict-align 2>&1 | FileCheck %s -check-prefix=NO-FAST-UNALIGNED-ACCESS -// FAST-UNALIGNED-ACCESS: "-target-feature" "+fast-unaligned-access" -// NO-FAST-UNALIGNED-ACCESS: "-target-feature" "-fast-unaligned-access" +// FAST-UNALIGNED-ACCESS: "-target-feature" "+unaligned-scalar-mem" "-target-feature" "+unaligned-vector-mem" +// NO-FAST-UNALIGNED-ACCESS: "-target-feature" "-unaligned-scalar-mem" "-target-feature" "-unaligned-vector-mem" // RUN: %clang --target=riscv32-unknown-elf -### %s 2>&1 | FileCheck %s -check-prefix=NOUWTABLE // RUN: %clang --target=riscv32-unknown-elf -fasynchronous-unwind-tables -### %s 2>&1 | FileCheck %s -check-prefix=UWTABLE diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 173995f05b51cc..d93709ac03420e 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -326,8 +326,8 @@ bool RISCVExpandPseudo::expandRV32ZdinxStore(MachineBasicBlock &MBB, .setMemRefs(MMOLo); if (MBBI->getOperand(2).isGlobal() || MBBI->getOperand(2).isCPI()) { - // FIXME: Zdinx RV32 can not work on unaligned memory. - assert(!STI->hasFastUnalignedAccess()); + // FIXME: Zdinx RV32 can not work on unaligned scalar memory. + assert(!STI->enableUnalignedScalarMem()); assert(MBBI->getOperand(2).getOffset() % 8 == 0); MBBI->getOperand(2).setOffset(MBBI->getOperand(2).getOffset() + 4); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 59962216e0c041..561187c39a4a04 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1183,10 +1183,15 @@ def FeatureTrailingSeqCstFence : SubtargetFeature<"seq-cst-trailing-fence", "true", "Enable trailing fence for seq-cst store.">; -def FeatureFastUnalignedAccess - : SubtargetFeature<"fast-unaligned-access", "HasFastUnalignedAccess", - "true", "Has reasonably performant unaligned " - "loads and stores (both scalar and vector)">; +def FeatureUnalignedScalarMem + : SubtargetFeature<"unaligned-scalar-mem", "EnableUnalignedScalarMem", + "true", "Has reasonably performant unaligned scalar " + "loads and stores">; + +def FeatureUnalignedVectorMem + : SubtargetFeature<"unaligned-vector-mem", "EnableUnalignedVectorMem", + "true", "Has reasonably performant unaligned vector " + "loads and stores">; def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler", "UsePostRAScheduler", "true", "Schedule again after register allocation">; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7b4bec2f65b741..b0deb1d2669952 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1924,7 +1924,7 @@ bool RISCVTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, // replace. If we don't support unaligned scalar mem, prefer the constant // pool. // TODO: Can the caller pass down the alignment? 
- if (!Subtarget.hasFastUnalignedAccess()) + if (!Subtarget.enableUnalignedScalarMem()) return true; // Prefer to keep the load if it would require many instructions. @@ -15837,7 +15837,7 @@ static bool matchIndexAsWiderOp(EVT VT, SDValue Index, SDValue Mask, if (WiderElementSize > ST.getELen()/8) return false; - if (!ST.hasFastUnalignedAccess() && BaseAlign < WiderElementSize) + if (!ST.enableUnalignedVectorMem() && BaseAlign < WiderElementSize) return false; for (unsigned i = 0; i < Index->getNumOperands(); i++) { @@ -20663,8 +20663,8 @@ bool RISCVTargetLowering::allowsMisalignedMemoryAccesses( unsigned *Fast) const { if (!VT.isVector()) { if (Fast) - *Fast = Subtarget.hasFastUnalignedAccess(); - return Subtarget.hasFastUnalignedAccess(); + *Fast = Subtarget.enableUnalignedScalarMem(); + return Subtarget.enableUnalignedScalarMem(); } // All vector implementations must support element alignment @@ -20680,8 +20680,8 @@ bool RISCVTargetLowering::allowsMisalignedMemoryAccesses( // misaligned accesses. TODO: Work through the codegen implications of // allowing such accesses to be formed, and considered fast. if (Fast) - *Fast = Subtarget.hasFastUnalignedAccess(); - return Subtarget.hasFastUnalignedAccess(); + *Fast = Subtarget.enableUnalignedVectorMem(); + return Subtarget.enableUnalignedVectorMem(); } @@ -20716,7 +20716,7 @@ EVT RISCVTargetLowering::getOptimalMemOpType(const MemOp &Op, // Do we have sufficient alignment for our preferred VT? If not, revert // to largest size allowed by our alignment criteria. - if (PreferredVT != MVT::i8 && !Subtarget.hasFastUnalignedAccess()) { + if (PreferredVT != MVT::i8 && !Subtarget.enableUnalignedVectorMem()) { Align RequiredAlign(PreferredVT.getStoreSize()); if (Op.isFixedDstAlign()) RequiredAlign = std::min(RequiredAlign, Op.getDstAlign()); @@ -20908,7 +20908,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, if (!isLegalElementTypeForRVV(ScalarType)) return false; - if (!Subtarget.hasFastUnalignedAccess() && + if (!Subtarget.enableUnalignedVectorMem() && Alignment < ScalarType.getStoreSize()) return false; diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 739b50749e1323..f9a557e02bfe1a 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -257,7 +257,8 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, FeatureStdExtZbb, FeatureStdExtZbs, FeatureStdExtZfhmin, - FeatureFastUnalignedAccess], + FeatureUnalignedScalarMem, + FeatureUnalignedVectorMem], [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, TuneLUIADDIFusion, @@ -295,7 +296,8 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, FeatureStdExtZvkng, FeatureStdExtZvksc, FeatureStdExtZvksg, - FeatureFastUnalignedAccess], + FeatureUnalignedScalarMem, + FeatureUnalignedVectorMem], [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, TuneLUIADDIFusion, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index e0c0e6517b6f1f..2f9281ab892447 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -228,7 +228,7 @@ class RISCVTTIImpl : public BasicTTIImplBase { return false; EVT ElemType = DataTypeVT.getScalarType(); - if (!ST->hasFastUnalignedAccess() && Alignment < ElemType.getStoreSize()) + if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; return 
TLI->isLegalElementTypeForRVV(ElemType); @@ -253,7 +253,7 @@ class RISCVTTIImpl : public BasicTTIImplBase { return false; EVT ElemType = DataTypeVT.getScalarType(); - if (!ST->hasFastUnalignedAccess() && Alignment < ElemType.getStoreSize()) + if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; return TLI->isLegalElementTypeForRVV(ElemType); diff --git a/llvm/test/CodeGen/RISCV/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/memcpy-inline.ll index 343695ee37da84..833e07351eec77 100644 --- a/llvm/test/CodeGen/RISCV/memcpy-inline.ll +++ b/llvm/test/CodeGen/RISCV/memcpy-inline.ll @@ -3,9 +3,9 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 ; RUN: llc < %s -mtriple=riscv64 \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 -; RUN: llc < %s -mtriple=riscv32 -mattr=+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv32 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST -; RUN: llc < %s -mtriple=riscv64 -mattr=+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST ; ---------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll index 12ec0881b20d9f..02f582339d0b78 100644 --- a/llvm/test/CodeGen/RISCV/memcpy.ll +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -3,9 +3,9 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 ; RUN: llc < %s -mtriple=riscv64 \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 -; RUN: llc < %s -mtriple=riscv32 -mattr=+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv32 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST -; RUN: llc < %s -mtriple=riscv64 -mattr=+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } diff --git a/llvm/test/CodeGen/RISCV/memset-inline.ll b/llvm/test/CodeGen/RISCV/memset-inline.ll index cc22b77c641e27..55fe81a58805ed 100644 --- a/llvm/test/CodeGen/RISCV/memset-inline.ll +++ b/llvm/test/CodeGen/RISCV/memset-inline.ll @@ -3,9 +3,9 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 -; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } diff --git a/llvm/test/CodeGen/RISCV/pr56110.ll b/llvm/test/CodeGen/RISCV/pr56110.ll index c795b17419f564..fa441f5fc3aef4 100644 --- a/llvm/test/CodeGen/RISCV/pr56110.ll +++ b/llvm/test/CodeGen/RISCV/pr56110.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=riscv32 | FileCheck %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+fast-unaligned-access | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+unaligned-scalar-mem | FileCheck %s define void @foo_set(ptr nocapture noundef %a, i32 noundef %v) { ; CHECK-LABEL: foo_set: diff --git 
a/llvm/test/CodeGen/RISCV/riscv-func-target-feature.ll b/llvm/test/CodeGen/RISCV/riscv-func-target-feature.ll index a03dadbc1d1160..d627ae9c90394e 100644 --- a/llvm/test/CodeGen/RISCV/riscv-func-target-feature.ll +++ b/llvm/test/CodeGen/RISCV/riscv-func-target-feature.ll @@ -36,7 +36,7 @@ entry: } ; CHECK-NOT: .option push -define void @test5() "target-features"="+fast-unaligned-access" { +define void @test5() "target-features"="+unaligned-scalar-mem" { ; CHECK-LABEL: test5 ; CHECK-NOT: .option pop entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll b/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll index f244810e739d93..ff35043dbd7e75 100644 --- a/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll +++ b/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+fast-unaligned-access -target-abi=ilp32 \ +; RUN: llc -mtriple=riscv32 -mattr=+v,+unaligned-vector-mem -target-abi=ilp32 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+v,+fast-unaligned-access -target-abi=lp64 \ +; RUN: llc -mtriple=riscv64 -mattr=+v,+unaligned-vector-mem -target-abi=lp64 \ ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 define void @constant_forward_stride(ptr %s, ptr %d) { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll index 657d52354aa39f..f0fcc482e2207e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64 -; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+fast-unaligned-access -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN +; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN ; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll index fffc4d6c08335c..36c36a13964c92 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll @@ -3,9 +3,9 @@ ; RUN: | FileCheck %s --check-prefixes=SLOW,RV32-SLOW ; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=SLOW,RV64-SLOW -; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+fast-unaligned-access -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+unaligned-vector-mem -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=FAST,RV32-FAST -; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+fast-unaligned-access -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem -verify-machineinstrs < 
%s \ ; RUN: | FileCheck %s --check-prefixes=FAST,RV64-FAST define <4 x i32> @load_v4i32_align1(ptr %ptr) { diff --git a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll index 485f94ee2a1026..53598c609107b0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll +++ b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll @@ -3,9 +3,9 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 ; RUN: llc < %s -mtriple=riscv64 -mattr=+v \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 -; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+unaligned-scalar-mem,+unaligned-vector-mem \ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST -; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+unaligned-scalar-mem,+unaligned-vector-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST ; ---------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll index 0e7e914cf68e8a..accc18519d6260 100644 --- a/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll +++ b/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll @@ -3,9 +3,9 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 -; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+unaligned-scalar-mem,,+unaligned-vector-mem \ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+unaligned-scalar-mem,+unaligned-vector-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } diff --git a/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll b/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll index f488baf5a9d9fe..1491bb6c337a02 100644 --- a/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll +++ b/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll @@ -3,9 +3,9 @@ ; RUN: -verify-machineinstrs | FileCheck %s ; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+zvfh,+v < %s \ ; RUN: -verify-machineinstrs | FileCheck %s -; RUN: llc -mtriple riscv32 -mattr=+d,+zfh,+zvfh,+v,+fast-unaligned-access < %s \ +; RUN: llc -mtriple riscv32 -mattr=+d,+zfh,+zvfh,+v,+unaligned-vector-mem < %s \ ; RUN: -verify-machineinstrs | FileCheck --check-prefix=FAST %s -; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+zvfh,+v,+fast-unaligned-access < %s \ +; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+zvfh,+v,+unaligned-vector-mem < %s \ ; RUN: -verify-machineinstrs | FileCheck --check-prefix=FAST %s diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll index 599b0d08629eaf..ce0d8fedbfb88f 100644 --- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll +++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll @@ -3,9 +3,9 @@ ; RUN: | FileCheck -check-prefixes=ALL,SLOW,RV32I %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=ALL,SLOW,RV64I %s -; RUN: llc -mtriple=riscv32 -mattr=+fast-unaligned-access -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+unaligned-scalar-mem -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=ALL,FAST,RV32I-FAST %s 
-; RUN: llc -mtriple=riscv64 -mattr=+fast-unaligned-access -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+unaligned-scalar-mem -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=ALL,FAST,RV64I-FAST %s ; A collection of cases showing codegen for unaligned loads and stores diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp index 7a6439cb94910e..e57bc6fb507e32 100644 --- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp +++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp @@ -60,11 +60,19 @@ static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) { if (MArch.empty()) MArch = getMArch(*Rec); - const bool FastUnalignedAccess = + bool FastScalarUnalignedAccess = any_of(Rec->getValueAsListOfDefs("Features"), [&](auto &Feature) { - return Feature->getValueAsString("Name") == "fast-unaligned-access"; + return Feature->getValueAsString("Name") == "unaligned-scalar-mem"; }); + bool FastVectorUnalignedAccess = + any_of(Rec->getValueAsListOfDefs("Features"), [&](auto &Feature) { + return Feature->getValueAsString("Name") == "unaligned-vector-mem"; + }); + + bool FastUnalignedAccess = + FastScalarUnalignedAccess && FastVectorUnalignedAccess; + OS << "PROC(" << Rec->getName() << ", " << "{\"" << Rec->getValueAsString("Name") << "\"}, " << "{\"" << MArch << "\"}, " << FastUnalignedAccess << ")\n"; From 988ffd06722e7e056b239efe497345ac97be33db Mon Sep 17 00:00:00 2001 From: Usama Hameed Date: Tue, 16 Apr 2024 16:00:14 -0700 Subject: [PATCH 183/300] Add asan tests for libsanitizers. (#88349) (#88962) The previous patch was reverted because the test fails to build when libsanitizers is not present. This patch catches the BuildError exception and skips the test appropriately. This patch tests LLDB integration with libsanitizers for ASan. 
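A minimal sketch of the skip-on-build-failure pattern the tests adopt, assuming
only that BuildError is raised when the make target cannot be built (it is
imported from lldbsuite.test_event.build_exception in this patch):

```python
# Build the libsanitizers flavor of the test binary; if the local
# toolchain cannot build it, skip the test instead of failing it.
try:
    self.build(make_targets=["libsanitizers"])
except BuildError:
    self.skipTest("failed to build with libsanitizers")
```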
rdar://111856681 --- lldb/test/API/functionalities/asan/Makefile | 6 +- .../functionalities/asan/TestMemoryHistory.py | 74 ++++++++++++++++++- .../functionalities/asan/TestReportData.py | 21 +++++- 3 files changed, 94 insertions(+), 7 deletions(-) diff --git a/lldb/test/API/functionalities/asan/Makefile b/lldb/test/API/functionalities/asan/Makefile index 4913a18d8cc6f9..d66696fed7078f 100644 --- a/lldb/test/API/functionalities/asan/Makefile +++ b/lldb/test/API/functionalities/asan/Makefile @@ -1,4 +1,8 @@ C_SOURCES := main.c -CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info +asan: CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info +asan: all + +libsanitizers: CFLAGS_EXTRAS := -fsanitize=address -fsanitize-stable-abi -g -gcolumn-info +libsanitizers: all include Makefile.rules diff --git a/lldb/test/API/functionalities/asan/TestMemoryHistory.py b/lldb/test/API/functionalities/asan/TestMemoryHistory.py index 00162ae8822c74..41ab25823f5cc6 100644 --- a/lldb/test/API/functionalities/asan/TestMemoryHistory.py +++ b/lldb/test/API/functionalities/asan/TestMemoryHistory.py @@ -8,16 +8,24 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbplatform from lldbsuite.test import lldbutil - +from lldbsuite.test_event.build_exception import BuildError class AsanTestCase(TestBase): @skipIfFreeBSD # llvm.org/pr21136 runtimes not yet available by default @expectedFailureNetBSD @skipUnlessAddressSanitizer def test(self): - self.build() + self.build(make_targets=["asan"]) self.asan_tests() + @skipIf(oslist=no_match(["macosx"])) + def test_libsanitizers_asan(self): + try: + self.build(make_targets=["libsanitizers"]) + except BuildError as e: + self.skipTest("failed to build with libsanitizers") + self.libsanitizer_tests() + def setUp(self): # Call super's setUp(). 
TestBase.setUp(self) @@ -26,6 +34,68 @@ def setUp(self): self.line_free = line_number("main.c", "// free line") self.line_breakpoint = line_number("main.c", "// break line") + # Test line numbers: rdar://126237493 + def libsanitizer_tests(self): + target = self.createTestTarget() + + self.runCmd( + "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0" + ) + + self.runCmd("run") + + # In libsanitizers, memory history is not supported until a report has been generated + self.expect( + "thread list", + "Process should be stopped due to ASan report", + substrs=["stopped", "stop reason = Use of deallocated memory"], + ) + + # test the 'memory history' command + self.expect( + "memory history 'pointer'", + substrs=[ + "Memory deallocated by Thread", + "a.out`f2", + "main.c", + "Memory allocated by Thread", + "a.out`f1", + "main.c", + ], + ) + + # do the same using SB API + process = self.dbg.GetSelectedTarget().process + val = ( + process.GetSelectedThread().GetSelectedFrame().EvaluateExpression("pointer") + ) + addr = val.GetValueAsUnsigned() + threads = process.GetHistoryThreads(addr) + self.assertEqual(threads.GetSize(), 2) + + history_thread = threads.GetThreadAtIndex(0) + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + + history_thread = threads.GetThreadAtIndex(1) + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + + # let's free the container (SBThreadCollection) and see if the + # SBThreads still live + threads = None + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + def asan_tests(self): target = self.createTestTarget() diff --git a/lldb/test/API/functionalities/asan/TestReportData.py b/lldb/test/API/functionalities/asan/TestReportData.py index 543c5fe66a208d..5e4c179e2a4819 100644 --- a/lldb/test/API/functionalities/asan/TestReportData.py +++ b/lldb/test/API/functionalities/asan/TestReportData.py @@ -8,7 +8,7 @@ from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil - +from lldbsuite.test_event.build_exception import BuildError class AsanTestReportDataCase(TestBase): @skipIfFreeBSD # llvm.org/pr21136 runtimes not yet available by default @@ -16,9 +16,17 @@ class AsanTestReportDataCase(TestBase): @skipUnlessAddressSanitizer @skipIf(archs=["i386"], bugnumber="llvm.org/PR36710") def test(self): - self.build() + self.build(make_targets=["asan"]) self.asan_tests() + @skipIf(oslist=no_match(["macosx"])) + def test_libsanitizers_asan(self): + try: + self.build(make_targets=["libsanitizers"]) + except BuildError as e: + self.skipTest("failed to build with libsanitizers") + self.asan_tests(libsanitizers=True) + def setUp(self): # Call super's setUp(). 
TestBase.setUp(self) @@ -29,10 +37,15 @@ def setUp(self): self.line_crash = line_number("main.c", "// BOOM line") self.col_crash = 16 - def asan_tests(self): + def asan_tests(self, libsanitizers=False): target = self.createTestTarget() - self.registerSanitizerLibrariesWithTarget(target) + if libsanitizers: + self.runCmd( + "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0" + ) + else: + self.registerSanitizerLibrariesWithTarget(target) self.runCmd("run") From 50a371795bcfe0731f8882e42712dff33cbbef9b Mon Sep 17 00:00:00 2001 From: darkbuck Date: Tue, 16 Apr 2024 19:10:11 -0400 Subject: [PATCH 184/300] [X86] Fix instr desc of CFCMOV's 'mr' variants - With the memory operand as the destination, 'mr' variants of CFCMOV works like STORE and their memory operands should be input operands instead of output ones. Reviewers: XinWang10, arsenm Pull Request: https://github.com/llvm/llvm-project/pull/88970 --- llvm/lib/Target/X86/X86InstrCMovSetCC.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 27a0c889a4da3e..e27aa4115990e9 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -58,8 +58,8 @@ let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in { } let SchedRW = [WriteCMOV, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], Predicates = [HasCMOV, HasCF, In64BitMode], mayStore = 1 in - def mr : ITy<0x40, MRMDestMemCC, t, (outs t.MemOperand:$dst), - (ins t.RegClass:$src1, ccode:$cond), + def mr : ITy<0x40, MRMDestMemCC, t, (outs), + (ins t.MemOperand:$dst, t.RegClass:$src1, ccode:$cond), "cfcmov${cond}", unaryop_ndd_args, []>, UseEFLAGS, NF; } From 1bc092181bf50d6be95b165e91bd906710710ca7 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Tue, 16 Apr 2024 19:17:27 -0400 Subject: [PATCH 185/300] [bazel] Add support for lldb-server (#88989) --- .../llvm-project-overlay/lldb/BUILD.bazel | 73 ++++++++++++++++++- .../lldb/source/Plugins/BUILD.bazel | 19 +++++ 2 files changed, 90 insertions(+), 2 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel index 6dfe8085b92857..1f2b5b476bcc11 100644 --- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel @@ -702,6 +702,9 @@ cc_library( "//lldb/source/Plugins:PluginSymbolLocatorDebugSymbols", "//lldb/source/Plugins:PluginSymbolVendorMacOSX", ], + "@platforms//os:linux": [ + "//lldb/source/Plugins:PluginProcessLinux", + ], "//conditions:default": [], }), ) @@ -752,7 +755,13 @@ cc_binary( data = [ ":lldb-argdumper", ] + select({ - "@platforms//os:macos": [":debugserver"], + "@platforms//os:macos": [ + ":debugserver", + ":lldb-server", + ], + "@platforms//os:linux": [ + ":lldb-server", + ], "//conditions:default": [], }), deps = [ @@ -799,8 +808,8 @@ cc_library( ["tools/debugserver/source/**/*.cpp"], exclude = ["tools/debugserver/source/debugserver.cpp"], ), - tags = ["nobuildkite"], local_defines = ["LLDB_USE_OS_LOG"], + tags = ["nobuildkite"], deps = [ ":DebugServerCommonHeaders", ":DebugServerCommonMacOSXHeaders", @@ -852,3 +861,63 @@ cc_binary( srcs = glob(["tools/argdumper/*.cpp"]), deps = ["//llvm:Support"], ) + +gentbl_cc_library( + name = "lldb_server_opts_gen", + strip_include_prefix = ".", + tbl_outs = [( + ["-gen-opt-parser-defs"], + "LLGSOptions.inc", + )], + tblgen = "//llvm:llvm-tblgen", + td_file = 
"tools/lldb-server/LLGSOptions.td", + deps = ["//llvm:OptParserTdFiles"], +) + +cc_binary( + name = "lldb-server", + srcs = glob([ + "tools/lldb-server/*.cpp", + "tools/lldb-server/*.h", + ]), + target_compatible_with = select({ + "@platforms//os:linux": [], + "@platforms//os:macos": [], + # TODO: This can theoretically support more platforms, but it hasn't been tested yet + "//conditions:default": ["@platforms//:incompatible"], + }), + deps = [ + ":Host", + ":Initialization", + ":Utility", + ":Version", + ":lldb_server_opts_gen", + "//lldb:Target", + "//lldb:TargetHeaders", + "//lldb/source/Plugins:PluginCPlusPlusLanguage", + "//lldb/source/Plugins:PluginExpressionParserClang", + "//lldb/source/Plugins:PluginInstructionARM", + "//lldb/source/Plugins:PluginInstructionARM64", + "//lldb/source/Plugins:PluginInstructionLoongArch", + "//lldb/source/Plugins:PluginInstructionMIPS", + "//lldb/source/Plugins:PluginInstructionMIPS64", + "//lldb/source/Plugins:PluginInstructionRISCV", + "//lldb/source/Plugins:PluginObjCLanguage", + "//lldb/source/Plugins:PluginProcessGDBRemote", + "//lldb/source/Plugins:PluginSymbolFileDWARF", + "//lldb/source/Plugins:PluginSymbolFileNativePDB", + "//lldb/source/Plugins:PluginSymbolFilePDB", + "//lldb/source/Plugins:PluginTypeSystemClang", + "//llvm:Option", + "//llvm:Support", + ] + select({ + "@platforms//os:linux": [ + "//lldb/source/Plugins:PluginObjectFileELF", + "//lldb/source/Plugins:PluginProcessLinux", + ], + "@platforms//os:macos": [ + "//lldb/source/Plugins:PluginObjectFileMachO", + ], + "//conditions:default": [], + }), +) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index bbc523f54a190d..b5f5bed1698a6b 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -2100,6 +2100,25 @@ cc_library( ], ) +cc_library( + name = "PluginProcessLinux", + srcs = glob(["Process/Linux/*.cpp"]), + hdrs = glob(["Process/Linux/*.h"]), + include_prefix = "Plugins", + deps = [ + ":PluginProcessPOSIX", + ":PluginProcessUtility", + "//lldb:Core", + "//lldb:Headers", + "//lldb:Host", + "//lldb:SymbolHeaders", + "//lldb:TargetHeaders", + "//lldb:Utility", + "//llvm:Support", + "//llvm:TargetParser", + ], +) + cc_library( name = "PluginScriptedProcess", srcs = glob(["Process/scripted/*.cpp"]), From be50a259f1fe77240b000f6b695b9b6394f4936b Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Tue, 16 Apr 2024 16:22:31 -0700 Subject: [PATCH 186/300] Update foldFMulReassoc to respect absent fast-math flags (#88589) This change updates a few of the transformations in foldFMulReassoc to respect absent fast-math flags in cases where fmul and fdiv, fadd, or fsub instructions were being folded but the code was only checking for fast-math flags on the fmul instruction and was transferring flags to the folded instruction that were not present on the other original instructions. 
This fixes https://github.com/llvm/llvm-project/issues/82857 --- llvm/include/llvm/IR/InstrTypes.h | 27 +++++ .../InstCombine/InstCombineMulDivRem.cpp | 38 +++++-- llvm/test/Transforms/InstCombine/fast-math.ll | 4 +- llvm/test/Transforms/InstCombine/fmul-pow.ll | 30 ++--- llvm/test/Transforms/InstCombine/fmul.ll | 104 ++++++++++++------ 5 files changed, 141 insertions(+), 62 deletions(-) diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index cfe1b11ade5a4e..8e6bef69218c2b 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -24,6 +24,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/FMF.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/LLVMContext.h" @@ -311,6 +312,32 @@ class BinaryOperator : public Instruction { return BO; } + static BinaryOperator *CreateWithFMF(BinaryOps Opc, Value *V1, Value *V2, + FastMathFlags FMF, + const Twine &Name = "", + Instruction *InsertBefore = nullptr) { + BinaryOperator *BO = Create(Opc, V1, V2, Name, InsertBefore); + BO->setFastMathFlags(FMF); + return BO; + } + + static BinaryOperator *CreateFAddFMF(Value *V1, Value *V2, FastMathFlags FMF, + const Twine &Name = "") { + return CreateWithFMF(Instruction::FAdd, V1, V2, FMF, Name); + } + static BinaryOperator *CreateFSubFMF(Value *V1, Value *V2, FastMathFlags FMF, + const Twine &Name = "") { + return CreateWithFMF(Instruction::FSub, V1, V2, FMF, Name); + } + static BinaryOperator *CreateFMulFMF(Value *V1, Value *V2, FastMathFlags FMF, + const Twine &Name = "") { + return CreateWithFMF(Instruction::FMul, V1, V2, FMF, Name); + } + static BinaryOperator *CreateFDivFMF(Value *V1, Value *V2, FastMathFlags FMF, + const Twine &Name = "") { + return CreateWithFMF(Instruction::FDiv, V1, V2, FMF, Name); + } + static BinaryOperator *CreateFAddFMF(Value *V1, Value *V2, Instruction *FMFSource, const Twine &Name = "") { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 48372381a0d1cd..7b86fcde8937ba 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -624,31 +624,38 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { Value *Op1 = I.getOperand(1); Value *X, *Y; Constant *C; + BinaryOperator *Op0BinOp; // Reassociate constant RHS with another constant to form constant // expression. - if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) { + if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP() && + match(Op0, m_AllowReassoc(m_BinOp(Op0BinOp)))) { + // Everything in this scope folds I with Op0, intersecting their FMF. 
+ FastMathFlags FMF = I.getFastMathFlags() & Op0BinOp->getFastMathFlags(); + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(FMF); Constant *C1; if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) { // (C1 / X) * C --> (C * C1) / X Constant *CC1 = ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL); if (CC1 && CC1->isNormalFP()) - return BinaryOperator::CreateFDivFMF(CC1, X, &I); + return BinaryOperator::CreateFDivFMF(CC1, X, FMF); } if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) { + // FIXME: This seems like it should also be checking for arcp // (X / C1) * C --> X * (C / C1) Constant *CDivC1 = ConstantFoldBinaryOpOperands(Instruction::FDiv, C, C1, DL); if (CDivC1 && CDivC1->isNormalFP()) - return BinaryOperator::CreateFMulFMF(X, CDivC1, &I); + return BinaryOperator::CreateFMulFMF(X, CDivC1, FMF); // If the constant was a denormal, try reassociating differently. // (X / C1) * C --> X / (C1 / C) Constant *C1DivC = ConstantFoldBinaryOpOperands(Instruction::FDiv, C1, C, DL); if (C1DivC && Op0->hasOneUse() && C1DivC->isNormalFP()) - return BinaryOperator::CreateFDivFMF(X, C1DivC, &I); + return BinaryOperator::CreateFDivFMF(X, C1DivC, FMF); } // We do not need to match 'fadd C, X' and 'fsub X, C' because they are @@ -658,26 +665,33 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { // (X + C1) * C --> (X * C) + (C * C1) if (Constant *CC1 = ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL)) { - Value *XC = Builder.CreateFMulFMF(X, C, &I); - return BinaryOperator::CreateFAddFMF(XC, CC1, &I); + Value *XC = Builder.CreateFMul(X, C); + return BinaryOperator::CreateFAddFMF(XC, CC1, FMF); } } if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) { // (C1 - X) * C --> (C * C1) - (X * C) if (Constant *CC1 = ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL)) { - Value *XC = Builder.CreateFMulFMF(X, C, &I); - return BinaryOperator::CreateFSubFMF(CC1, XC, &I); + Value *XC = Builder.CreateFMul(X, C); + return BinaryOperator::CreateFSubFMF(CC1, XC, FMF); } } } Value *Z; if (match(&I, - m_c_FMul(m_OneUse(m_FDiv(m_Value(X), m_Value(Y))), m_Value(Z)))) { - // Sink division: (X / Y) * Z --> (X * Z) / Y - Value *NewFMul = Builder.CreateFMulFMF(X, Z, &I); - return BinaryOperator::CreateFDivFMF(NewFMul, Y, &I); + m_c_FMul(m_AllowReassoc(m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))), + m_Value(Z)))) { + BinaryOperator *DivOp = cast(((Z == Op0) ? 
Op1 : Op0)); + FastMathFlags FMF = I.getFastMathFlags() & DivOp->getFastMathFlags(); + if (FMF.allowReassoc()) { + // Sink division: (X / Y) * Z --> (X * Z) / Y + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(FMF); + auto *NewFMul = Builder.CreateFMul(X, Z); + return BinaryOperator::CreateFDivFMF(NewFMul, Y, FMF); + } } // sqrt(X) * sqrt(Y) -> sqrt(X * Y) diff --git a/llvm/test/Transforms/InstCombine/fast-math.ll b/llvm/test/Transforms/InstCombine/fast-math.ll index 129d7811cfb867..916955e34efacb 100644 --- a/llvm/test/Transforms/InstCombine/fast-math.ll +++ b/llvm/test/Transforms/InstCombine/fast-math.ll @@ -562,7 +562,7 @@ define float @fdiv1(float %x) { ; CHECK-NEXT: [[DIV1:%.*]] = fmul fast float [[X:%.*]], 0x3FD7303B60000000 ; CHECK-NEXT: ret float [[DIV1]] ; - %div = fdiv float %x, 0x3FF3333340000000 + %div = fdiv fast float %x, 0x3FF3333340000000 %div1 = fdiv fast float %div, 0x4002666660000000 ret float %div1 ; 0x3FF3333340000000 = 1.2f @@ -603,7 +603,7 @@ define float @fdiv3(float %x) { ; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast float [[TMP1]], 0x47EFFFFFE0000000 ; CHECK-NEXT: ret float [[DIV1]] ; - %div = fdiv float %x, 0x47EFFFFFE0000000 + %div = fdiv fast float %x, 0x47EFFFFFE0000000 %div1 = fdiv fast float %div, 0x4002666660000000 ret float %div1 } diff --git a/llvm/test/Transforms/InstCombine/fmul-pow.ll b/llvm/test/Transforms/InstCombine/fmul-pow.ll index 63458e136074c9..84592d220d62c4 100644 --- a/llvm/test/Transforms/InstCombine/fmul-pow.ll +++ b/llvm/test/Transforms/InstCombine/fmul-pow.ll @@ -85,8 +85,8 @@ define double @pow_ab_recip_a_reassoc(double %a, double %b) { ; CHECK-NEXT: [[M:%.*]] = call reassoc double @llvm.pow.f64(double [[A:%.*]], double [[TMP1]]) ; CHECK-NEXT: ret double [[M]] ; - %r = fdiv double 1.0, %a - %p = call double @llvm.pow.f64(double %a, double %b) + %r = fdiv reassoc double 1.0, %a + %p = call reassoc double @llvm.pow.f64(double %a, double %b) %m = fmul reassoc double %r, %p ret double %m } @@ -99,8 +99,8 @@ define double @pow_ab_recip_a_reassoc_commute(double %a, double %b) { ; CHECK-NEXT: [[M:%.*]] = call reassoc double @llvm.pow.f64(double [[A:%.*]], double [[TMP1]]) ; CHECK-NEXT: ret double [[M]] ; - %r = fdiv double 1.0, %a - %p = call double @llvm.pow.f64(double %a, double %b) + %r = fdiv reassoc double 1.0, %a + %p = call reassoc double @llvm.pow.f64(double %a, double %b) %m = fmul reassoc double %p, %r ret double %m } @@ -109,14 +109,14 @@ define double @pow_ab_recip_a_reassoc_commute(double %a, double %b) { define double @pow_ab_recip_a_reassoc_use1(double %a, double %b) { ; CHECK-LABEL: @pow_ab_recip_a_reassoc_use1( -; CHECK-NEXT: [[R:%.*]] = fdiv double 1.000000e+00, [[A:%.*]] -; CHECK-NEXT: [[P:%.*]] = call double @llvm.pow.f64(double [[A]], double [[B:%.*]]) +; CHECK-NEXT: [[R:%.*]] = fdiv reassoc double 1.000000e+00, [[A:%.*]] +; CHECK-NEXT: [[P:%.*]] = call reassoc double @llvm.pow.f64(double [[A]], double [[B:%.*]]) ; CHECK-NEXT: [[M:%.*]] = fmul reassoc double [[R]], [[P]] ; CHECK-NEXT: call void @use(double [[R]]) ; CHECK-NEXT: ret double [[M]] ; - %r = fdiv double 1.0, %a - %p = call double @llvm.pow.f64(double %a, double %b) + %r = fdiv reassoc double 1.0, %a + %p = call reassoc double @llvm.pow.f64(double %a, double %b) %m = fmul reassoc double %r, %p call void @use(double %r) ret double %m @@ -126,13 +126,13 @@ define double @pow_ab_recip_a_reassoc_use1(double %a, double %b) { define double @pow_ab_recip_a_reassoc_use2(double %a, double %b) { ; CHECK-LABEL: @pow_ab_recip_a_reassoc_use2( -; 
CHECK-NEXT: [[P:%.*]] = call double @llvm.pow.f64(double [[A:%.*]], double [[B:%.*]]) +; CHECK-NEXT: [[P:%.*]] = call reassoc double @llvm.pow.f64(double [[A:%.*]], double [[B:%.*]]) ; CHECK-NEXT: [[M:%.*]] = fdiv reassoc double [[P]], [[A]] ; CHECK-NEXT: call void @use(double [[P]]) ; CHECK-NEXT: ret double [[M]] ; - %r = fdiv double 1.0, %a - %p = call double @llvm.pow.f64(double %a, double %b) + %r = fdiv reassoc double 1.0, %a + %p = call reassoc double @llvm.pow.f64(double %a, double %b) %m = fmul reassoc double %r, %p call void @use(double %p) ret double %m @@ -142,15 +142,15 @@ define double @pow_ab_recip_a_reassoc_use2(double %a, double %b) { define double @pow_ab_recip_a_reassoc_use3(double %a, double %b) { ; CHECK-LABEL: @pow_ab_recip_a_reassoc_use3( -; CHECK-NEXT: [[R:%.*]] = fdiv double 1.000000e+00, [[A:%.*]] -; CHECK-NEXT: [[P:%.*]] = call double @llvm.pow.f64(double [[A]], double [[B:%.*]]) +; CHECK-NEXT: [[R:%.*]] = fdiv reassoc double 1.000000e+00, [[A:%.*]] +; CHECK-NEXT: [[P:%.*]] = call reassoc double @llvm.pow.f64(double [[A]], double [[B:%.*]]) ; CHECK-NEXT: [[M:%.*]] = fmul reassoc double [[R]], [[P]] ; CHECK-NEXT: call void @use(double [[R]]) ; CHECK-NEXT: call void @use(double [[P]]) ; CHECK-NEXT: ret double [[M]] ; - %r = fdiv double 1.0, %a - %p = call double @llvm.pow.f64(double %a, double %b) + %r = fdiv reassoc double 1.0, %a + %p = call reassoc double @llvm.pow.f64(double %a, double %b) %m = fmul reassoc double %r, %p call void @use(double %r) call void @use(double %p) diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index f6435f0032891e..39f9e74f899d18 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -633,15 +633,15 @@ define float @log2half(float %x, float %y) { define float @log2half_commute(float %x1, float %y) { ; CHECK-LABEL: @log2half_commute( +; CHECK-NEXT: [[X1:%.*]] = fmul fast float [[X2:%.*]], 0x3FC24924A0000000 ; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.log2.f32(float [[Y:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[TMP1]], [[X1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[TMP1]], [[X1]] ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast float [[TMP2]], [[X1]] -; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP3]], 0x3FC24924A0000000 -; CHECK-NEXT: ret float [[MUL]] +; CHECK-NEXT: ret float [[TMP3]] ; - %x = fdiv float %x1, 7.0 ; thwart complexity-based canonicalization - %halfy = fmul float %y, 0.5 - %log2 = call float @llvm.log2.f32(float %halfy) + %x = fdiv fast float %x1, 7.0 ; thwart complexity-based canonicalization + %halfy = fmul fast float %y, 0.5 + %log2 = call fast float @llvm.log2.f32(float %halfy) %mul = fmul fast float %x, %log2 ret float %mul } @@ -652,12 +652,50 @@ define float @fdiv_constant_numerator_fmul(float %x) { ; CHECK-LABEL: @fdiv_constant_numerator_fmul( ; CHECK-NEXT: [[T3:%.*]] = fdiv reassoc float 1.200000e+07, [[X:%.*]] ; CHECK-NEXT: ret float [[T3]] +; + %t1 = fdiv reassoc float 2.0e+3, %x + %t3 = fmul reassoc float %t1, 6.0e+3 + ret float %t3 +} + +; C1/X * C2 => (C1*C2) / X with mixed fast-math flags + +define float @fdiv_constant_numerator_fmul_mixed(float %x) { +; CHECK-LABEL: @fdiv_constant_numerator_fmul_mixed( +; CHECK-NEXT: [[T3:%.*]] = fdiv reassoc float 1.200000e+07, [[X:%.*]] +; CHECK-NEXT: ret float [[T3]] +; + %t1 = fdiv reassoc float 2.0e+3, %x + %t3 = fmul fast float %t1, 6.0e+3 + ret float %t3 +} + +; C1/X * C2 => (C1*C2) / X with full fast-math flags + +define float 
@fdiv_constant_numerator_fmul_fast(float %x) { +; CHECK-LABEL: @fdiv_constant_numerator_fmul_fast( +; CHECK-NEXT: [[T3:%.*]] = fdiv fast float 1.200000e+07, [[X:%.*]] +; CHECK-NEXT: ret float [[T3]] +; + %t1 = fdiv fast float 2.0e+3, %x + %t3 = fmul fast float %t1, 6.0e+3 + ret float %t3 +} + +; C1/X * C2 => (C1*C2) / X with no fast-math flags on the fdiv + +define float @fdiv_constant_numerator_fmul_precdiv(float %x) { +; CHECK-LABEL: @fdiv_constant_numerator_fmul_precdiv( +; CHECK-NEXT: [[T1:%.*]] = fdiv float 2.000000e+03, [[X:%.*]] +; CHECK-NEXT: [[T4:%.*]] = fmul reassoc float [[T1]], 6.000000e+03 +; CHECK-NEXT: ret float [[T4]] ; %t1 = fdiv float 2.0e+3, %x %t3 = fmul reassoc float %t1, 6.0e+3 ret float %t3 } + ; C1/X * C2 => (C1*C2) / X is disabled if C1/X has multiple uses @fmul2_external = external global float @@ -682,7 +720,7 @@ define float @fdiv_constant_denominator_fmul(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fmul reassoc float [[X:%.*]], 3.000000e+00 ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fdiv float %x, 2.0e+3 + %t1 = fdiv reassoc float %x, 2.0e+3 %t3 = fmul reassoc float %t1, 6.0e+3 ret float %t3 } @@ -692,7 +730,7 @@ define <4 x float> @fdiv_constant_denominator_fmul_vec(<4 x float> %x) { ; CHECK-NEXT: [[T3:%.*]] = fmul reassoc <4 x float> [[X:%.*]], ; CHECK-NEXT: ret <4 x float> [[T3]] ; - %t1 = fdiv <4 x float> %x, + %t1 = fdiv reassoc <4 x float> %x, %t3 = fmul reassoc <4 x float> %t1, ret <4 x float> %t3 } @@ -705,7 +743,7 @@ define <4 x float> @fdiv_constant_denominator_fmul_vec_constexpr(<4 x float> %x) ; CHECK-NEXT: ret <4 x float> [[T3]] ; %constExprMul = bitcast i128 trunc (i160 bitcast (<5 x float> to i160) to i128) to <4 x float> - %t1 = fdiv <4 x float> %x, + %t1 = fdiv reassoc <4 x float> %x, %t3 = fmul reassoc <4 x float> %t1, %constExprMul ret <4 x float> %t3 } @@ -734,7 +772,7 @@ define float @fdiv_constant_denominator_fmul_denorm(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fmul fast float [[X:%.*]], 0x3760620000000000 ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fdiv float %x, 2.0e+3 + %t1 = fdiv fast float %x, 2.0e+3 %t3 = fmul fast float %t1, 0x3810000000000000 ret float %t3 } @@ -748,7 +786,7 @@ define float @fdiv_constant_denominator_fmul_denorm_try_harder(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fdiv reassoc float [[X:%.*]], 0x47E8000000000000 ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fdiv float %x, 3.0 + %t1 = fdiv reassoc float %x, 3.0 %t3 = fmul reassoc float %t1, 0x3810000000000000 ret float %t3 } @@ -776,7 +814,7 @@ define float @fmul_fadd_distribute(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc float [[TMP1]], 6.000000e+00 ; CHECK-NEXT: ret float [[T3]] ; - %t2 = fadd float %x, 2.0 + %t2 = fadd reassoc float %x, 2.0 %t3 = fmul reassoc float %t2, 3.0 ret float %t3 } @@ -787,7 +825,7 @@ define <2 x float> @fmul_fadd_distribute_vec(<2 x float> %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc <2 x float> [[TMP1]], ; CHECK-NEXT: ret <2 x float> [[T3]] ; - %t1 = fadd <2 x float> , %x + %t1 = fadd reassoc <2 x float> , %x %t3 = fmul reassoc <2 x float> %t1, ret <2 x float> %t3 } @@ -798,7 +836,7 @@ define @fmul_fadd_distribute_scalablevec( [[TMP1]], shufflevector ( insertelement ( poison, float 1.200000e+07, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[T3]] ; - %t1 = fadd splat (float 2.0e+3), %x + %t1 = fadd reassoc splat (float 2.0e+3), %x %t3 = fmul reassoc %t1, splat (float 6.0e+3) @@ -813,7 +851,7 @@ define float @fmul_fsub_distribute1(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc float [[TMP1]], -6.000000e+00 ; CHECK-NEXT: ret float [[T3]] 
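; Note: the distributive folds exercised below rewrite
;   (%x +/- C1) * C2  ==>  %x * C2 +/- C1 * C2
; and they now fire only when the inner fadd/fsub carries reassoc as well,
; which is why these tests gain fast-math flags on their input instructions.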
; - %t2 = fsub float %x, 2.0 + %t2 = fsub reassoc float %x, 2.0 %t3 = fmul reassoc float %t2, 3.0 ret float %t3 } @@ -826,7 +864,7 @@ define float @fmul_fsub_distribute2(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fsub reassoc float 6.000000e+00, [[TMP1]] ; CHECK-NEXT: ret float [[T3]] ; - %t2 = fsub float 2.0, %x + %t2 = fsub reassoc float 2.0, %x %t3 = fmul reassoc float %t2, 3.0 ret float %t3 } @@ -840,8 +878,8 @@ define float @fmul_fadd_fmul_distribute(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd fast float [[TMP1]], 1.000000e+01 ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fmul float %x, 6.0 - %t2 = fadd float %t1, 2.0 + %t1 = fmul fast float %x, 6.0 + %t2 = fadd fast float %t1, 2.0 %t3 = fmul fast float %t2, 5.0 ret float %t3 } @@ -872,8 +910,8 @@ define double @fmul_fadd_fdiv_distribute2(double %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc double [[TMP1]], 0x34000000000000 ; CHECK-NEXT: ret double [[T3]] ; - %t1 = fdiv double %x, 3.0 - %t2 = fadd double %t1, 5.0 + %t1 = fdiv reassoc double %x, 3.0 + %t2 = fadd reassoc double %t1, 5.0 %t3 = fmul reassoc double %t2, 0x10000000000000 ret double %t3 } @@ -887,8 +925,8 @@ define double @fmul_fadd_fdiv_distribute3(double %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc double [[TMP1]], 0x34000000000000 ; CHECK-NEXT: ret double [[T3]] ; - %t1 = fdiv double %x, 3.0 - %t2 = fadd double %t1, 5.0 + %t1 = fdiv reassoc double %x, 3.0 + %t2 = fadd reassoc double %t1, 5.0 %t3 = fmul reassoc double %t2, 0x10000000000000 ret double %t3 } @@ -902,8 +940,8 @@ define float @fmul_fsub_fmul_distribute(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fsub fast float 1.000000e+01, [[TMP1]] ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fmul float %x, 6.0 - %t2 = fsub float 2.0, %t1 + %t1 = fmul fast float %x, 6.0 + %t2 = fsub fast float 2.0, %t1 %t3 = fmul fast float %t2, 5.0 ret float %t3 } @@ -932,8 +970,8 @@ define float @fmul_fsub_fmul_distribute2(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd fast float [[TMP1]], -1.000000e+01 ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fmul float %x, 6.0 - %t2 = fsub float %t1, 2.0 + %t1 = fmul fast float %x, 6.0 + %t2 = fsub fast float %t1, 2.0 %t3 = fmul fast float %t2, 5.0 ret float %t3 } @@ -986,8 +1024,8 @@ define double @fmul_fdivs_factor_common_denominator(double %x, double %y, double ; CHECK-NEXT: [[MUL:%.*]] = fdiv fast double [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret double [[MUL]] ; - %div1 = fdiv double %x, %z - %div2 = fdiv double %y, %z + %div1 = fdiv fast double %x, %z + %div2 = fdiv fast double %y, %z %mul = fmul fast double %div1, %div2 ret double %mul } @@ -999,8 +1037,8 @@ define double @fmul_fdivs_factor(double %x, double %y, double %z, double %w) { ; CHECK-NEXT: [[MUL:%.*]] = fdiv reassoc double [[TMP2]], [[Y:%.*]] ; CHECK-NEXT: ret double [[MUL]] ; - %div1 = fdiv double %x, %y - %div2 = fdiv double %z, %w + %div1 = fdiv reassoc double %x, %y + %div2 = fdiv reassoc double %z, %w %mul = fmul reassoc double %div1, %div2 ret double %mul } @@ -1011,7 +1049,7 @@ define double @fmul_fdiv_factor(double %x, double %y, double %z) { ; CHECK-NEXT: [[MUL:%.*]] = fdiv reassoc double [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: ret double [[MUL]] ; - %div = fdiv double %x, %y + %div = fdiv reassoc double %x, %y %mul = fmul reassoc double %div, %z ret double %mul } @@ -1022,7 +1060,7 @@ define double @fmul_fdiv_factor_constant1(double %x, double %y) { ; CHECK-NEXT: [[MUL:%.*]] = fdiv reassoc double [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: ret double [[MUL]] ; - %div = fdiv double %x, %y + %div = fdiv reassoc double %x, %y %mul = fmul reassoc double %div, 42.0 ret 
double %mul
 }
 
@@ -1033,7 +1071,7 @@ define <2 x float> @fmul_fdiv_factor_constant2(<2 x float> %x, <2 x float> %y) {
 ; CHECK-NEXT:    [[MUL:%.*]] = fdiv reassoc <2 x float> [[TMP1]],
 ; CHECK-NEXT:    ret <2 x float> [[MUL]]
 ;
-  %div = fdiv <2 x float> %x,
+  %div = fdiv reassoc <2 x float> %x,
   %mul = fmul reassoc <2 x float> %div, %y
   ret <2 x float> %mul
 }

From ce5381e22a50f354cf3d1763589f1daf155c481b Mon Sep 17 00:00:00 2001
From: Prashant Kumar
Date: Wed, 17 Apr 2024 05:36:40 +0530
Subject: [PATCH 187/300] [mlir][vector] Determine vector sizes from the
 result shape in the case of tensor pack (#88249)

When the vector sizes are not passed as inputs to the vector transform
operation, they are inferred from the static result shape in the case of
the tensor.pack op.
---
 .../Linalg/Transforms/Vectorization.cpp       | 54 +++++++++++++-----
 .../Linalg/vectorization-unsupported.mlir     | 17 ++++++
 mlir/test/Dialect/Linalg/vectorization.mlir   | 55 +++++++++++++++++++
 3 files changed, 113 insertions(+), 13 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 25785653a71675..df61381432921b 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -1412,10 +1412,11 @@ static SmallVector<int64_t> getTiledPackShape(tensor::PackOp packOp,
 
 /// Create a TransferReadOp from `source` with static shape `readShape`. If the
 /// vector type for the read is not the same as the type of `source`, then a
-/// mask is created on the read.
+/// mask is created on the read. If the `doMasking` parameter is set to false,
+/// we update the `inBounds` attribute instead of masking.
 static Value createReadOrMaskedRead(OpBuilder &builder, Location loc,
                                     Value source, ArrayRef<int64_t> readShape,
-                                    Value padValue) {
+                                    Value padValue, bool doMasking = true) {
   assert(llvm::none_of(readShape,
                        [](int64_t s) { return s == ShapedType::kDynamic; }));
   auto sourceShape = dyn_cast<ShapedType>(source.getType()).getShape();
@@ -1424,14 +1425,21 @@ static Value createReadOrMaskedRead(OpBuilder &builder, Location loc,
   auto vectorType = VectorType::get(readShape, padValue.getType());
   int64_t readRank = readShape.size();
   auto zero = builder.create<arith::ConstantIndexOp>(loc, 0);
+  SmallVector<bool> inBoundsVal(readRank, true);
+  if (!doMasking) {
+    // Update the inBounds attribute.
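+    // A dimension can stay in-bounds only if the read shape matches the
+    // source shape in that dimension; any dimension being padded must be
+    // marked out-of-bounds so that the transfer_read itself performs the
+    // padding with `padValue`.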
+ for (unsigned i = 0; i < readRank; i++) + inBoundsVal[i] = sourceShape[i] == readShape[i]; + } auto transferReadOp = builder.create( loc, /*vectorType=*/vectorType, /*source=*/source, /*indices=*/SmallVector(readRank, zero), /*padding=*/padValue, - /*inBounds=*/SmallVector(readRank, true)); - if (llvm::equal(readShape, sourceShape)) { + /*inBounds=*/inBoundsVal); + + if (llvm::equal(readShape, sourceShape) || !doMasking) { return transferReadOp; } SmallVector mixedSourceDims = @@ -1482,11 +1490,10 @@ static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc, return write; } -/// Vectorize tensor::PackOp with (1) static innerTiles and (2) constant -/// padding value into: +/// Vectorize tensor::PackOp with (1) static innerTiles (2) constant +/// padding value and (3) input vector sizes into: /// masked_transfer_read->shape_cast->transpose->transfer_write_in_bounds /// As in the following example: -/// /// %pack = tensor.pack %src inner_dims_pos = [2, 1] inner_tiles = [16, 2] /// into %dst : tensor<32x8x16xf32> -> tensor<32x4x1x16x2xf32> /// @@ -1505,6 +1512,10 @@ static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc, /// %empty[%c0_0, %c0_0, %c0_0, %c0_0, %c0_0] /// {in_bounds = [true, true, true, true, true]} /// : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> +/// +/// If the (3) input vector sizes are not provided, the vector sizes are +/// determined by the result tensor shape. Also, we update the inBounds +/// attribute instead of masking. static LogicalResult vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, ArrayRef inputVectorSizes, @@ -1525,6 +1536,16 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, (void)status; // prevent unused variable warning on non-assert builds. assert(succeeded(status) && "failed to reify result shapes"); + // If the input vector sizes are not provided, then the vector sizes are + // determined by the result tensor shape. In case the vector sizes aren't + // provided, we update the inBounds attribute instead of masking. + bool doMasking = true; + if (inputVectorSizes.empty()) { + ArrayRef resultTensorShape = packOp.getDestType().getShape(); + inputVectorSizes = resultTensorShape.take_front(packOp.getSourceRank()); + doMasking = false; + } + // Create masked TransferReadOp. SmallVector inputShape(inputVectorSizes); auto innerTiles = packOp.getStaticInnerTiles(); @@ -1536,7 +1557,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, for (auto [idx, size] : enumerate(innerTiles)) inputShape[innerDimsPos[idx]] *= size; auto maskedRead = createReadOrMaskedRead(rewriter, loc, packOp.getSource(), - inputShape, padValue); + inputShape, padValue, doMasking); // Create ShapeCastOp. SmallVector destShape(inputVectorSizes); @@ -1763,7 +1784,7 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op, /// Returns success if `inputVectorSizes` is a valid masking configuraion for /// given `shape`, i.e., it meets: /// 1. The numbers of elements in both array are equal. -/// 2. `inputVectorSizes` does nos have dynamic dimensions. +/// 2. `inputVectorSizes` does not have dynamic dimensions. /// 3. All the values in `inputVectorSizes` are greater than or equal to /// static sizes in `shape`. static LogicalResult @@ -1881,18 +1902,25 @@ static LogicalResult vectorizeLinalgOpPrecondition( return success(); } -/// TODO: Use a matcher to check for a constant padding value. 
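/// Note: the TODO above is resolved by this change. The precondition below
/// now uses matchPattern(padValue, m_Constant(&cstAttr)), which accepts any
/// ConstantLike-producing op, rather than testing only for a defining
/// arith::ConstantOp.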
static LogicalResult vectorizePackOpPrecondition(tensor::PackOp packOp, ArrayRef inputVectorSizes) { auto padValue = packOp.getPaddingValue(); - if (padValue && !padValue.getDefiningOp()) { + Attribute cstAttr; + if (padValue && !matchPattern(padValue, m_Constant(&cstAttr))) { LDBG("pad value is not constant: " << packOp << "\n"); return failure(); } - ArrayRef resultTensorShape = packOp.getDestType().getShape(); - if (failed(isValidMaskedInputVector( + bool satisfyEmptyCond = true; + if (inputVectorSizes.empty()) { + if (!packOp.getDestType().hasStaticShape() || + !packOp.getSourceType().hasStaticShape()) + satisfyEmptyCond = false; + } + + if (!satisfyEmptyCond && + failed(isValidMaskedInputVector( resultTensorShape.take_front(packOp.getSourceRank()), inputVectorSizes))) return failure(); diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir index 9127eac5da9510..5d3c07c8e23c1e 100644 --- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir @@ -109,3 +109,20 @@ module attributes {transform.with_named_sequence} { transform.yield } } + + // ----- + +func.func @test_pack_no_vectorize_dynamic_shape(%arg0: tensor, %arg1: tensor<4x16xf32>) -> tensor<4x16xf32> { + %pad = arith.constant 0.000000e+00 : f32 + // expected-error @+1 {{Attempted to vectorize, but failed}} + %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [0] inner_tiles = [16] into %arg1 : tensor -> tensor<4x16xf32> + return %pack : tensor<4x16xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir index fd7d3b4767eb22..80a5a4c6702ac1 100644 --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -930,3 +930,58 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: transform.yield } } + + // ----- + +// CHECK-LABEL: test_vectorize_pack_no_vector_sizes +func.func @test_vectorize_pack_no_vector_sizes(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> { + %pack = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %arg1 : tensor<64x4xf32> -> tensor<2x4x16x2xf32> + return %pack : tensor<2x4x16x2xf32> +} +// CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index +// CHECK: %[[read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]]], %[[cst]] +// CHECK-SAME: {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32> +// CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[read]] : vector<64x4xf32> to vector<4x16x2x2xf32> +// CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32> +// CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<2x4x16x2xf32> +// CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]] +// CHECK-SAME: {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32> +// CHECK: 
return %[[write]] : tensor<2x4x16x2xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 : !transform.any_op + transform.yield + } +} + + // ----- + +// CHECK-LABEL: test_vectorize_padded_pack_no_vector_sizes +func.func @test_vectorize_padded_pack_no_vector_sizes(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { + %pad = arith.constant 0.000000e+00 : f32 + %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> + return %pack : tensor<32x4x1x16x2xf32> +} +// CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index +// CHECK: %[[transfer_read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]] +// CHECK-SAME: {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32> +// CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[transfer_read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> +// CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 1, 3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> +// CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32> +// CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]] +// CHECK-SAME: {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> +// CHECK: return %[[write]] : tensor<32x4x1x16x2xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 : !transform.any_op + transform.yield + } +} From 8c9f45e2decbb68dbf83794f98291b53f59390f8 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Tue, 16 Apr 2024 17:08:02 -0700 Subject: [PATCH 188/300] [ARM64EC] Fix arm_neon.h on ARM64EC. (#88572) Since 97fe519d, in ARM64EC mode, we don't define `__aarch64__`. Fix various preprocessor guards to account for this. --- clang/include/clang/Basic/arm_fp16.td | 2 +- clang/include/clang/Basic/arm_neon.td | 58 +++++++++++++-------------- clang/utils/TableGen/NeonEmitter.cpp | 8 ++-- 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/clang/include/clang/Basic/arm_fp16.td b/clang/include/clang/Basic/arm_fp16.td index cb2a09303e8e12..d36b4617bef5d2 100644 --- a/clang/include/clang/Basic/arm_fp16.td +++ b/clang/include/clang/Basic/arm_fp16.td @@ -14,7 +14,7 @@ include "arm_neon_incl.td" // ARMv8.2-A FP16 intrinsics. 
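// Background for the guard rewrites below: ARM64EC predefines __arm64ec__
// but, since 97fe519d, no longer __aarch64__, so every AArch64-only
// ArchGuard in these .td files moves to the two-macro form
//   defined(__aarch64__) || defined(__arm64ec__)
// which the generated arm_neon.h / arm_fp16.h headers then test verbatim.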
-let ArchGuard = "defined(__aarch64__)", TargetGuard = "fullfp16" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fullfp16" in { // Negate def VNEGSH : SInst<"vneg", "11", "Sh">; diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 7edac5afafaa99..6d655c39360d3b 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -605,11 +605,11 @@ def VQDMULL_LANE : SOpInst<"vqdmull_lane", "(>Q)..I", "si", OP_QDMULL_LN>; def VQDMULH_N : SOpInst<"vqdmulh_n", "..1", "siQsQi", OP_QDMULH_N>; def VQRDMULH_N : SOpInst<"vqrdmulh_n", "..1", "siQsQi", OP_QRDMULH_N>; -let ArchGuard = "!defined(__aarch64__)" in { +let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)" in { def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>; def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>; } -let ArchGuard = "defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in { def A64_VQDMULH_LANE : SInst<"vqdmulh_lane", "..(!q)I", "siQsQi">; def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..(!q)I", "siQsQi">; } @@ -686,7 +686,7 @@ multiclass REINTERPRET_CROSS_TYPES { // E.3.31 Vector reinterpret cast operations def VREINTERPRET : REINTERPRET_CROSS_SELF<"csilUcUsUiUlhfPcPsQcQsQiQlQUcQUsQUiQUlQhQfQPcQPs"> { - let ArchGuard = "!defined(__aarch64__)"; + let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)"; let BigEndianSafe = 1; } @@ -714,7 +714,7 @@ def VADDP : WInst<"vadd", "...", "PcPsPlQPcQPsQPl">; //////////////////////////////////////////////////////////////////////////////// // AArch64 Intrinsics -let ArchGuard = "defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in { //////////////////////////////////////////////////////////////////////////////// // Load/Store @@ -1091,14 +1091,14 @@ let isLaneQ = 1 in { def VQDMULH_LANEQ : SInst<"vqdmulh_laneq", "..QI", "siQsQi">; def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.1a" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" in { def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN> { let isLaneQ = 1; } def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN> { let isLaneQ = 1; } -} // ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.1a" +} // ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" // Note: d type implemented by SCALAR_VMULX_LANE def VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "fQfQd", OP_MULX_LN>; @@ -1143,7 +1143,7 @@ def SHA256H2 : SInst<"vsha256h2", "....", "QUi">; def SHA256SU1 : SInst<"vsha256su1", "....", "QUi">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "sha3" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3" in { def BCAX : SInst<"vbcax", "....", "QUcQUsQUiQUlQcQsQiQl">; def EOR3 : SInst<"veor3", "....", "QUcQUsQUiQUlQcQsQiQl">; def RAX1 : SInst<"vrax1", "...", "QUl">; @@ -1153,14 +1153,14 @@ def XAR : SInst<"vxar", "...I", "QUl">; } } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "sha3" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3" in { def SHA512SU0 : SInst<"vsha512su0", "...", "QUl">; def SHA512su1 : SInst<"vsha512su1", "....", "QUl">; def SHA512H : SInst<"vsha512h", "....", "QUl">; def 
SHA512H2 : SInst<"vsha512h2", "....", "QUl">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "sm4" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4" in { def SM3SS1 : SInst<"vsm3ss1", "....", "QUi">; def SM3TT1A : SInst<"vsm3tt1a", "....I", "QUi">; def SM3TT1B : SInst<"vsm3tt1b", "....I", "QUi">; @@ -1170,7 +1170,7 @@ def SM3PARTW1 : SInst<"vsm3partw1", "....", "QUi">; def SM3PARTW2 : SInst<"vsm3partw2", "....", "QUi">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "sm4" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4" in { def SM4E : SInst<"vsm4e", "...", "QUi">; def SM4EKEY : SInst<"vsm4ekey", "...", "QUi">; } @@ -1193,7 +1193,7 @@ def FCVTAS_S32 : SInst<"vcvta_s32", "S.", "fQf">; def FCVTAU_S32 : SInst<"vcvta_u32", "U.", "fQf">; } -let ArchGuard = "defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in { def FCVTNS_S64 : SInst<"vcvtn_s64", "S.", "dQd">; def FCVTNU_S64 : SInst<"vcvtn_u64", "U.", "dQd">; def FCVTPS_S64 : SInst<"vcvtp_s64", "S.", "dQd">; @@ -1217,7 +1217,7 @@ def FRINTZ_S32 : SInst<"vrnd", "..", "fQf">; def FRINTI_S32 : SInst<"vrndi", "..", "fQf">; } -let ArchGuard = "defined(__aarch64__) && defined(__ARM_FEATURE_DIRECTED_ROUNDING)" in { +let ArchGuard = "(defined(__aarch64__) || defined(__arm64ec__)) && defined(__ARM_FEATURE_DIRECTED_ROUNDING)" in { def FRINTN_S64 : SInst<"vrndn", "..", "dQd">; def FRINTA_S64 : SInst<"vrnda", "..", "dQd">; def FRINTP_S64 : SInst<"vrndp", "..", "dQd">; @@ -1227,7 +1227,7 @@ def FRINTZ_S64 : SInst<"vrnd", "..", "dQd">; def FRINTI_S64 : SInst<"vrndi", "..", "dQd">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.5a" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.5a" in { def FRINT32X_S32 : SInst<"vrnd32x", "..", "fQf">; def FRINT32Z_S32 : SInst<"vrnd32z", "..", "fQf">; def FRINT64X_S32 : SInst<"vrnd64x", "..", "fQf">; @@ -1247,7 +1247,7 @@ def FMAXNM_S32 : SInst<"vmaxnm", "...", "fQf">; def FMINNM_S32 : SInst<"vminnm", "...", "fQf">; } -let ArchGuard = "defined(__aarch64__) && defined(__ARM_FEATURE_NUMERIC_MAXMIN)" in { +let ArchGuard = "(defined(__aarch64__) || defined(__arm64ec__)) && defined(__ARM_FEATURE_NUMERIC_MAXMIN)" in { def FMAXNM_S64 : SInst<"vmaxnm", "...", "dQd">; def FMINNM_S64 : SInst<"vminnm", "...", "dQd">; } @@ -1289,7 +1289,7 @@ def VQTBX4_A64 : WInst<"vqtbx4", "..(4Q)U", "UccPcQUcQcQPc">; // itself during generation so, unlike all other intrinsics, this one should // include *all* types, not just additional ones. 
def VVREINTERPRET : REINTERPRET_CROSS_SELF<"csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk"> { - let ArchGuard = "defined(__aarch64__)"; + let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)"; let BigEndianSafe = 1; } @@ -1401,7 +1401,7 @@ def SCALAR_SQDMULH : SInst<"vqdmulh", "111", "SsSi">; // Scalar Integer Saturating Rounding Doubling Multiply Half High def SCALAR_SQRDMULH : SInst<"vqrdmulh", "111", "SsSi">; -let ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.1a" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" in { //////////////////////////////////////////////////////////////////////////////// // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half def SCALAR_SQRDMLAH : SInst<"vqrdmlah", "1111", "SsSi">; @@ -1409,7 +1409,7 @@ def SCALAR_SQRDMLAH : SInst<"vqrdmlah", "1111", "SsSi">; //////////////////////////////////////////////////////////////////////////////// // Signed Saturating Rounding Doubling Multiply Subtract Returning High Half def SCALAR_SQRDMLSH : SInst<"vqrdmlsh", "1111", "SsSi">; -} // ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.1a" +} // ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" //////////////////////////////////////////////////////////////////////////////// // Scalar Floating-point Multiply Extended @@ -1651,7 +1651,7 @@ def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcS let isLaneQ = 1; } -} // ArchGuard = "defined(__aarch64__)" +} // ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" // ARMv8.2-A FP16 vector intrinsics for A32/A64. let TargetGuard = "fullfp16" in { @@ -1775,7 +1775,7 @@ def VEXTH : WInst<"vext", "...I", "hQh">; def VREV64H : WOpInst<"vrev64", "..", "hQh", OP_REV64>; // ARMv8.2-A FP16 vector intrinsics for A64 only. -let ArchGuard = "defined(__aarch64__)", TargetGuard = "fullfp16" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fullfp16" in { // Vector rounding def FRINTIH : SInst<"vrndi", "..", "hQh">; @@ -1856,7 +1856,7 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fullfp16" in { def FMINNMVH : SInst<"vminnmv", "1.", "hQh">; } -let ArchGuard = "defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in { // Permutation def VTRN1H : SOpInst<"vtrn1", "...", "hQh", OP_TRN1>; def VZIP1H : SOpInst<"vzip1", "...", "hQh", OP_ZIP1>; @@ -1876,7 +1876,7 @@ let TargetGuard = "dotprod" in { def DOT : SInst<"vdot", "..(<<)(<<)", "iQiUiQUi">; def DOT_LANE : SOpInst<"vdot_lane", "..(<<)(<; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "dotprod" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "dotprod" in { // Variants indexing into a 128-bit vector are A64 only. def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(< { let isLaneQ = 1; @@ -1884,7 +1884,7 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "dotprod" in { } // v8.2-A FP16 fused multiply-add long instructions. 
-let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp16fml" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fp16fml" in { def VFMLAL_LOW : SInst<"vfmlal_low", ">>..", "hQh">; def VFMLSL_LOW : SInst<"vfmlsl_low", ">>..", "hQh">; def VFMLAL_HIGH : SInst<"vfmlal_high", ">>..", "hQh">; @@ -1918,7 +1918,7 @@ let TargetGuard = "i8mm" in { def VUSDOT_LANE : SOpInst<"vusdot_lane", "..(<; def VSUDOT_LANE : SOpInst<"vsudot_lane", "..(<<)(<; - let ArchGuard = "defined(__aarch64__)" in { + let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in { let isLaneQ = 1 in { def VUSDOT_LANEQ : SOpInst<"vusdot_laneq", "..(<; def VSUDOT_LANEQ : SOpInst<"vsudot_laneq", "..(<<)(<; @@ -1986,7 +1986,7 @@ let TargetGuard = "v8.3a" in { defm VCMLA_F32 : VCMLA_ROTS<"f", "uint64x1_t", "uint64x2_t">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.3a" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.3a" in { def VCADDQ_ROT90_FP64 : SInst<"vcaddq_rot90", "QQQ", "d">; def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">; @@ -2058,14 +2058,14 @@ let TargetGuard = "bf16" in { def SCALAR_CVT_F32_BF16 : SOpInst<"vcvtah_f32", "(1F>)(1!)", "b", OP_CVT_F32_BF16>; } -let ArchGuard = "!defined(__aarch64__)", TargetGuard = "bf16" in { +let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16" in { def VCVT_BF16_F32_A32_INTERNAL : WInst<"__a32_vcvt_bf16", "BQ", "f">; def VCVT_BF16_F32_A32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A32>; def VCVT_LOW_BF16_F32_A32 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A32>; def VCVT_HIGH_BF16_F32_A32 : SOpInst<"vcvt_high_bf16", "BBQ", "Qf", OP_VCVT_BF16_F32_HI_A32>; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "bf16" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16" in { def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">; def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>; def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">; @@ -2077,14 +2077,14 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "bf16" in { def COPYQ_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..I.I", "Qb", OP_COPY_LN>; } -let ArchGuard = "!defined(__aarch64__)", TargetGuard = "bf16" in { +let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16" in { let BigEndianSafe = 1 in { defm VREINTERPRET_BF : REINTERPRET_CROSS_TYPES< "csilUcUsUiUlhfPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQPcQPsQPl", "bQb">; } } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "bf16" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16" in { let BigEndianSafe = 1 in { defm VVREINTERPRET_BF : REINTERPRET_CROSS_TYPES< "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", "bQb">; @@ -2092,7 +2092,7 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "bf16" in { } // v8.9a/v9.4a LRCPC3 intrinsics -let ArchGuard = "defined(__aarch64__)", TargetGuard = "rcpc3" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "rcpc3" in { def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">; def VSTL1_LANE : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">; } diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 04e1acc2705004..56f1fdf9ef574f 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ 
b/clang/utils/TableGen/NeonEmitter.cpp @@ -2266,7 +2266,7 @@ static void emitNeonTypeDefs(const std::string& types, raw_ostream &OS) { InIfdef = false; } if (!InIfdef && IsA64) { - OS << "#ifdef __aarch64__\n"; + OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n"; InIfdef = true; } @@ -2299,7 +2299,7 @@ static void emitNeonTypeDefs(const std::string& types, raw_ostream &OS) { InIfdef = false; } if (!InIfdef && IsA64) { - OS << "#ifdef __aarch64__\n"; + OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n"; InIfdef = true; } @@ -2381,7 +2381,7 @@ void NeonEmitter::run(raw_ostream &OS) { OS << "#include \n"; // For now, signedness of polynomial types depends on target - OS << "#ifdef __aarch64__\n"; + OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n"; OS << "typedef uint8_t poly8_t;\n"; OS << "typedef uint16_t poly16_t;\n"; OS << "typedef uint64_t poly64_t;\n"; @@ -2582,7 +2582,7 @@ void NeonEmitter::runVectorTypes(raw_ostream &OS) { OS << "typedef float float32_t;\n"; OS << "typedef __fp16 float16_t;\n"; - OS << "#ifdef __aarch64__\n"; + OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n"; OS << "typedef double float64_t;\n"; OS << "#endif\n\n"; From 8c9d814b66f7df274de41225575817188fbeed4f Mon Sep 17 00:00:00 2001 From: Kai Sasaki Date: Wed, 17 Apr 2024 09:19:15 +0900 Subject: [PATCH 189/300] [mlir][complex] Fastmath flag for complex angle (#88658) See https://discourse.llvm.org/t/rfc-fastmath-flags-support-in-complex-dialect/71981 --- .../ComplexToStandard/ComplexToStandard.cpp | 3 ++- .../ComplexToStandard/convert-to-standard.mlir | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index 03e578136e5901..4a15976d40c763 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -1289,13 +1289,14 @@ struct AngleOpConversion : public OpConversionPattern { ConversionPatternRewriter &rewriter) const override { auto loc = op.getLoc(); auto type = op.getType(); + arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr(); Value real = rewriter.create(loc, type, adaptor.getComplex()); Value imag = rewriter.create(loc, type, adaptor.getComplex()); - rewriter.replaceOpWithNewOp(op, imag, real); + rewriter.replaceOpWithNewOp(op, imag, real, fmf); return success(); } diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir index fa1d564d6ad355..827ae940165c7e 100644 --- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir +++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir @@ -2187,3 +2187,16 @@ func.func @complex_tanh_nnan_ninf(%arg: complex) -> complex { // CHECK-COUNT-1: arith.select // CHECK-NOT: arith.select + +// ----- + +// CHECK-LABEL: func.func @complex_angle_with_fmf +// CHECK-SAME: %[[ARG:.*]]: complex +func.func @complex_angle_with_fmf(%arg: complex) -> f32 { + %angle = complex.angle %arg fastmath : complex + return %angle : f32 +} +// CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex +// CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex +// CHECK: %[[RESULT:.*]] = math.atan2 %[[IMAG]], %[[REAL]] fastmath : f32 +// CHECK: return %[[RESULT]] : f32 \ No newline at end of file From efd60556f759fbfa0fc0a5984463daeaef20799c Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 17 Apr 2024 09:22:57 +0900 Subject: 
[PATCH 190/300] Revert "[SLP]Attempt to vectorize long stores, if short one failed." This reverts commit 7d4e8c1f3bbfe976f4871c9cf953f76d771b0eda. Contrary to the commit description, this does cause large compile-time regressions (up to 10% on individual files). --- .../Transforms/Vectorize/SLPVectorizer.cpp | 81 +++++++------------ .../Transforms/SLPVectorizer/X86/pr46983.ll | 46 ++++++++--- 2 files changed, 65 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 8ae38550d3095d..7694627c3b0430 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15237,60 +15237,39 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, Size *= 2; }); unsigned StartIdx = 0; - unsigned Repeat = 0; - constexpr unsigned MaxAttempts = 2; - while (true) { - ++Repeat; - for (unsigned Size : CandidateVFs) { - for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); - assert( - all_of( - Slice, - [&](Value *V) { - return cast(V)->getValueOperand()->getType() == - cast(Slice.front()) - ->getValueOperand() - ->getType(); - }) && - "Expected all operands of same type."); - if (!VectorizedStores.count(Slice.front()) && - !VectorizedStores.count(Slice.back()) && - TriedSequences - .insert(std::make_pair(Slice.front(), Slice.back())) - .second && - vectorizeStoreChain(Slice, R, Cnt, MinVF)) { - // Mark the vectorized stores so that we don't vectorize them - // again. - VectorizedStores.insert(Slice.begin(), Slice.end()); - Changed = true; - // If we vectorized initial block, no need to try to vectorize - // it again. - if (Cnt == StartIdx) - StartIdx += Size; - Cnt += Size; - continue; - } - ++Cnt; - } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= Operands.size()) { - Repeat = MaxAttempts; - break; + for (unsigned Size : CandidateVFs) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); + assert( + all_of( + Slice, + [&](Value *V) { + return cast(V)->getValueOperand()->getType() == + cast(Slice.front()) + ->getValueOperand() + ->getType(); + }) && + "Expected all operands of same type."); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + TriedSequences.insert(std::make_pair(Slice.front(), Slice.back())) + .second && + vectorizeStoreChain(Slice, R, Cnt, MinVF)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; } + ++Cnt; } - // Check if tried all attempts or no need for the last attempts at all. - if (Repeat >= MaxAttempts) - break; - const unsigned MaxTotalNum = bit_floor(Operands.size() - StartIdx); - if (MaxVF >= MaxTotalNum) + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) break; - // Last attempt to vectorize max number of elements, if all previous - // attempts were unsuccessful because of the cost issues. 
- CandidateVFs.clear(); - for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2) { - CandidateVFs.push_back(Size); - } } } }; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll index 3deab0975ce764..75505f632a43f3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -100,17 +100,41 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) { define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) { ; SSE-LABEL: @store_i64( ; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] -; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer -; SSE-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]] -; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], -; SSE-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> -; SSE-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], -; SSE-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> -; SSE-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> -; SSE-NEXT: [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64> -; SSE-NEXT: store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] +; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] +; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 +; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 +; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 +; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 +; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] +; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 +; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 +; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 +; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 +; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 +; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 +; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] +; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 +; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 +; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 +; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 +; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24 +; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] +; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 +; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 +; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 +; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 +; 
SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 +; SSE-NEXT: store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @store_i64( From 7c2688977567ea5ac1203daa3c452b541ef55f67 Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Tue, 16 Apr 2024 15:20:14 -0400 Subject: [PATCH 191/300] [Sparc] Fix instr desc of special register stores - Those special register stores are STORE and their memory operands are input operands instead of output ones. Reviewers: JDevlieghere, arsenm, yinying-lisa-li, koachan, PeimingLiu, jyknight, aartbik, matthias-springer Reviewed By: arsenm Pull Request: https://github.com/llvm/llvm-project/pull/88971 --- llvm/lib/Target/Sparc/SparcInstrInfo.td | 30 ++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td index 5e792427cca282..4d68f93efeac17 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -693,38 +693,38 @@ let DecoderNamespace = "SparcV8", Predicates = [HasNoV9] in { } let rd = 0 in { - let Defs = [CPSR] in { - def STCSRrr : F3_1<3, 0b110101, (outs (MEMrr $rs1, $rs2):$addr), (ins), + let mayStore = 1, Uses = [CPSR] in { + def STCSRrr : F3_1<3, 0b110101, (outs), (ins (MEMrr $rs1, $rs2):$addr), "st %csr, [$addr]", [], IIC_st>; - def STCSRri : F3_2<3, 0b110101, (outs (MEMri $rs1, $simm13):$addr), (ins), + def STCSRri : F3_2<3, 0b110101, (outs), (ins (MEMri $rs1, $simm13):$addr), "st %csr, [$addr]", [], IIC_st>; } - let Defs = [CPQ] in { - def STDCQrr : F3_1<3, 0b110110, (outs (MEMrr $rs1, $rs2):$addr), (ins), + let mayStore = 1, Uses = [CPQ] in { + def STDCQrr : F3_1<3, 0b110110, (outs), (ins (MEMrr $rs1, $rs2):$addr), "std %cq, [$addr]", [], IIC_std>; - def STDCQri : F3_2<3, 0b110110, (outs (MEMri $rs1, $simm13):$addr), (ins), + def STDCQri : F3_2<3, 0b110110, (outs), (ins (MEMri $rs1, $simm13):$addr), "std %cq, [$addr]", [], IIC_std>; } } let rd = 0 in { - let Defs = [FSR] in { - def STFSRrr : F3_1<3, 0b100101, (outs (MEMrr $rs1, $rs2):$addr), (ins), + let mayStore = 1, Uses = [FSR] in { + def STFSRrr : F3_1<3, 0b100101, (outs), (ins (MEMrr $rs1, $rs2):$addr), "st %fsr, [$addr]", [], IIC_st>; - def STFSRri : F3_2<3, 0b100101, (outs (MEMri $rs1, $simm13):$addr), (ins), + def STFSRri : F3_2<3, 0b100101, (outs), (ins (MEMri $rs1, $simm13):$addr), "st %fsr, [$addr]", [], IIC_st>; } - let Defs = [FQ] in { - def STDFQrr : F3_1<3, 0b100110, (outs (MEMrr $rs1, $rs2):$addr), (ins), + let mayStore = 1, Defs = [FQ] in { + def STDFQrr : F3_1<3, 0b100110, (outs), (ins (MEMrr $rs1, $rs2):$addr), "std %fq, [$addr]", [], IIC_std>; - def STDFQri : F3_2<3, 0b100110, (outs (MEMri $rs1, $simm13):$addr), (ins), + def STDFQri : F3_2<3, 0b100110, (outs), (ins (MEMri $rs1, $simm13):$addr), "std %fq, [$addr]", [], IIC_std>; } } -let rd = 1, Defs = [FSR] in { - def STXFSRrr : F3_1<3, 0b100101, (outs (MEMrr $rs1, $rs2):$addr), (ins), +let rd = 1, mayStore = 1, Uses = [FSR] in { + def STXFSRrr : F3_1<3, 0b100101, (outs), (ins (MEMrr $rs1, $rs2):$addr), "stx %fsr, [$addr]", []>, Requires<[HasV9]>; - def STXFSRri : F3_2<3, 0b100101, (outs (MEMri $rs1, $simm13):$addr), (ins), + def STXFSRri : F3_2<3, 0b100101, (outs), (ins (MEMri $rs1, $simm13):$addr), "stx %fsr, [$addr]", []>, Requires<[HasV9]>; } From 62853a246ef131c4de73b63a141c85a0b14c75a5 Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Tue, 16 Apr 2024 15:20:32 -0400 Subject: [PATCH 192/300] 
[TableGen][InstrInfoEmitter] Count sub-operands on def operands - If a def operand includes multiple sub-operands, count them when generating instr info. - Found issues in x86 and sparc backends, where memory operands of store or store-like instructions are wrongly placed in the output list. Reviewers: jayfoad, arsenm, Pierre-vh Reviewed By: arsenm Pull Request: https://github.com/llvm/llvm-project/pull/88972 --- llvm/test/TableGen/def-multiple-operands.td | 37 +++++++++++++++++++++ llvm/utils/TableGen/InstrInfoEmitter.cpp | 8 ++++- 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 llvm/test/TableGen/def-multiple-operands.td diff --git a/llvm/test/TableGen/def-multiple-operands.td b/llvm/test/TableGen/def-multiple-operands.td new file mode 100644 index 00000000000000..b747c58907505a --- /dev/null +++ b/llvm/test/TableGen/def-multiple-operands.td @@ -0,0 +1,37 @@ +// RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s | FileCheck %s + +include "llvm/Target/Target.td" + +def archInstrInfo : InstrInfo {} + +def arch : Target { + let InstructionSet = archInstrInfo; +} + +def R0 : Register<"r0">; +def P0 : Register<"p0">; +def R32 : RegisterClass<"MyNS", [i32], 0, (add R0)>; +def P1 : RegisterClass<"MyNS", [i1], 0, (add P0)>; + +def Reg3Opnd : Operand { + let MIOperandInfo = (ops R32, R32, P1); +} + +// The following checks verify that 'MCInstrDesc' entry for 'InstA' has the +// expected 'NumOperands' and 'NumDefs', i.e. 'InstA' should have 3 defs out of +// 4 operands. + +// CHECK: archInstrTable {{.* = \{}} +// CHECK: {{\{}} +// CHECK: {{\{}} [[ID:[0-9]+]], 4, 3, 13, {{.+\}, \/\/}} +// CHECK-SAME: Inst #[[ID]] = InstA +def InstA : Instruction { + let Namespace = "MyNS"; + let Size = 13; + // InstA should have 3 defs out of 4 operands. + let OutOperandList = (outs Reg3Opnd:$dst); + let InOperandList = (ins i32imm:$c); + field bits<8> Inst; + field bits<8> SoftFail = 0; + let hasSideEffects = false; +} diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 36f8fa14653938..b3a05e081f6375 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -1181,9 +1181,15 @@ void InstrInfoEmitter::emitRecord( // Each logical operand can be multiple MI operands. MinOperands = Inst.Operands.back().MIOperandNo + Inst.Operands.back().MINumOperands; + // Even the logical output operand may be multiple MI operands. + int DefOperands = 0; + if (Inst.Operands.NumDefs) { + auto &Opnd = Inst.Operands[Inst.Operands.NumDefs - 1]; + DefOperands = Opnd.MIOperandNo + Opnd.MINumOperands; + } OS << " { "; - OS << Num << ",\t" << MinOperands << ",\t" << Inst.Operands.NumDefs << ",\t" + OS << Num << ",\t" << MinOperands << ",\t" << DefOperands << ",\t" << Inst.TheDef->getValueAsInt("Size") << ",\t" << SchedModels.getSchedClassIdx(Inst) << ",\t"; From d0f718e06848774a4e9d0b253cf75c1408b5f41a Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 16 Apr 2024 17:47:17 -0700 Subject: [PATCH 193/300] Revert "Improve stack usage to increase recursive initialization depth" (#89006) Reverts llvm/llvm-project#88546 Leak and performance regression. 
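The revert reinstates the small-buffer allocator for implicit conversion
sequences that #88546 had replaced with plain heap allocation. A minimal
stand-alone sketch of that pattern (not part of the commit; class and member
names are invented for illustration):

    #include "llvm/Support/Allocator.h"

    // Serve small requests from inline storage and spill the rest to a
    // BumpPtrAllocator slab, mirroring the restored slabAllocate().
    template <unsigned InlineBytes> class InlineSlab {
      llvm::BumpPtrAllocator Slab;
      alignas(void *) char Space[InlineBytes];
      unsigned Used = 0;

    public:
      void *allocate(unsigned NBytes) {
        if (NBytes > InlineBytes - Used)
          return Slab.Allocate(NBytes, alignof(void *)); // slab fallback
        char *P = Space + Used;
        Used += NBytes;
        return P;
      }
      void reset() {
        Slab.Reset();
        Used = 0;
      }
    };

The common small case never touches the heap, at the cost of the inline
bytes living in every candidate set.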
Details in #88546 --- clang/docs/ReleaseNotes.rst | 6 -- clang/include/clang/Sema/Initialization.h | 6 +- clang/include/clang/Sema/Overload.h | 70 +++++++++++++++++------ clang/lib/Sema/SemaInit.cpp | 26 ++++----- clang/lib/Sema/SemaOverload.cpp | 21 ++++--- 5 files changed, 75 insertions(+), 54 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 4aedfafcb26aea..3752b6ce157600 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -203,12 +203,6 @@ Non-comprehensive list of changes in this release - ``__typeof_unqual__`` is available in all C modes as an extension, which behaves like ``typeof_unqual`` from C23, similar to ``__typeof__`` and ``typeof``. -- Improved stack usage with C++ initialization code. This allows significantly - more levels of recursive initialization before reaching stack exhaustion - limits. This will positively impact recursive template instantiation code, - but should also reduce memory overhead for initializations in general. - Fixes #GH88330 - New Compiler Flags ------------------ - ``-fsanitize=implicit-bitfield-conversion`` checks implicit truncation and diff --git a/clang/include/clang/Sema/Initialization.h b/clang/include/clang/Sema/Initialization.h index 1ceacf0f49f568..2072cd8d1c3ef8 100644 --- a/clang/include/clang/Sema/Initialization.h +++ b/clang/include/clang/Sema/Initialization.h @@ -1134,7 +1134,7 @@ class InitializationSequence { OverloadingResult FailedOverloadResult; /// The candidate set created when initialization failed. - std::unique_ptr FailedCandidateSet; + OverloadCandidateSet FailedCandidateSet; /// The incomplete type that caused a failure. QualType FailedIncompleteType; @@ -1403,9 +1403,7 @@ class InitializationSequence { /// Retrieve a reference to the candidate set when overload /// resolution fails. OverloadCandidateSet &getFailedCandidateSet() { - assert(FailedCandidateSet && - "this should have been allocated in the constructor!"); - return *FailedCandidateSet; + return FailedCandidateSet; } /// Get the overloading result, for when the initialization diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index e6f88bbf7c4f47..76311b00d2fc58 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -37,7 +37,6 @@ #include #include #include -#include #include namespace clang { @@ -875,8 +874,7 @@ class Sema; ConversionFixItGenerator Fix; /// Viable - True to indicate that this overload candidate is viable. - LLVM_PREFERRED_TYPE(bool) - unsigned Viable : 1; + bool Viable : 1; /// Whether this candidate is the best viable function, or tied for being /// the best viable function. @@ -885,14 +883,12 @@ class Sema; /// was part of the ambiguity kernel: the minimal non-empty set of viable /// candidates such that all elements of the ambiguity kernel are better /// than all viable candidates not in the ambiguity kernel. - LLVM_PREFERRED_TYPE(bool) - unsigned Best : 1; + bool Best : 1; /// IsSurrogate - True to indicate that this candidate is a /// surrogate for a conversion to a function pointer or reference /// (C++ [over.call.object]). - LLVM_PREFERRED_TYPE(bool) - unsigned IsSurrogate : 1; + bool IsSurrogate : 1; /// IgnoreObjectArgument - True to indicate that the first /// argument's conversion, which for this function represents the @@ -901,20 +897,18 @@ class Sema; /// implicit object argument is just a placeholder) or a /// non-static member function when the call doesn't have an /// object argument. 
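  /// Note: the LLVM_PREFERRED_TYPE(T) markers removed throughout this hunk
  /// expand to [[clang::preferred_type(T)]], which only adjusts how the
  /// `unsigned` bit-fields are described in debug info; with the revert the
  /// fields return to their original declared types, so the annotation is
  /// no longer needed.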
- LLVM_PREFERRED_TYPE(bool) - unsigned IgnoreObjectArgument : 1; + bool IgnoreObjectArgument : 1; /// True if the candidate was found using ADL. - LLVM_PREFERRED_TYPE(CallExpr::ADLCallKind) - unsigned IsADLCandidate : 1; + CallExpr::ADLCallKind IsADLCandidate : 1; /// Whether this is a rewritten candidate, and if so, of what kind? LLVM_PREFERRED_TYPE(OverloadCandidateRewriteKind) unsigned RewriteKind : 2; /// FailureKind - The reason why this candidate is not viable. - LLVM_PREFERRED_TYPE(OverloadFailureKind) - unsigned FailureKind : 5; + /// Actually an OverloadFailureKind. + unsigned char FailureKind; /// The number of call arguments that were explicitly provided, /// to be used while performing partial ordering of function templates. @@ -978,9 +972,7 @@ class Sema; private: friend class OverloadCandidateSet; OverloadCandidate() - : IsSurrogate(false), - IsADLCandidate(static_cast(CallExpr::NotADL)), - RewriteKind(CRK_None) {} + : IsSurrogate(false), IsADLCandidate(CallExpr::NotADL), RewriteKind(CRK_None) {} }; /// OverloadCandidateSet - A set of overload candidates, used in C++ @@ -1078,16 +1070,51 @@ class Sema; }; private: - SmallVector Candidates; - llvm::SmallPtrSet Functions; + SmallVector Candidates; + llvm::SmallPtrSet Functions; + + // Allocator for ConversionSequenceLists. We store the first few of these + // inline to avoid allocation for small sets. + llvm::BumpPtrAllocator SlabAllocator; SourceLocation Loc; CandidateSetKind Kind; OperatorRewriteInfo RewriteInfo; + constexpr static unsigned NumInlineBytes = + 24 * sizeof(ImplicitConversionSequence); + unsigned NumInlineBytesUsed = 0; + alignas(void *) char InlineSpace[NumInlineBytes]; + // Address space of the object being constructed. LangAS DestAS = LangAS::Default; + /// If we have space, allocates from inline storage. Otherwise, allocates + /// from the slab allocator. + /// FIXME: It would probably be nice to have a SmallBumpPtrAllocator + /// instead. + /// FIXME: Now that this only allocates ImplicitConversionSequences, do we + /// want to un-generalize this? + template + T *slabAllocate(unsigned N) { + // It's simpler if this doesn't need to consider alignment. + static_assert(alignof(T) == alignof(void *), + "Only works for pointer-aligned types."); + static_assert(std::is_trivial::value || + std::is_same::value, + "Add destruction logic to OverloadCandidateSet::clear()."); + + unsigned NBytes = sizeof(T) * N; + if (NBytes > NumInlineBytes - NumInlineBytesUsed) + return SlabAllocator.Allocate(N); + char *FreeSpaceStart = InlineSpace + NumInlineBytesUsed; + assert(uintptr_t(FreeSpaceStart) % alignof(void *) == 0 && + "Misaligned storage!"); + + NumInlineBytesUsed += NBytes; + return reinterpret_cast(FreeSpaceStart); + } + void destroyCandidates(); public: @@ -1136,7 +1163,12 @@ class Sema; ConversionSequenceList allocateConversionSequences(unsigned NumConversions) { ImplicitConversionSequence *Conversions = - new ImplicitConversionSequence[NumConversions]; + slabAllocate(NumConversions); + + // Construct the new objects. 
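+    // Placement new is required here: slabAllocate() returns raw,
+    // uninitialized memory, and the matching explicit destructor calls are
+    // issued later by destroyCandidates().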
+ for (unsigned I = 0; I != NumConversions; ++I) + new (&Conversions[I]) ImplicitConversionSequence(); + return ConversionSequenceList(Conversions, NumConversions); } diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 791c0b6e6df23e..fb7a80ab02846c 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -6114,8 +6114,7 @@ InitializationSequence::InitializationSequence( Sema &S, const InitializedEntity &Entity, const InitializationKind &Kind, MultiExprArg Args, bool TopLevelOfInitList, bool TreatUnavailableAsInvalid) : FailedOverloadResult(OR_Success), - FailedCandidateSet(new OverloadCandidateSet( - Kind.getLocation(), OverloadCandidateSet::CSK_Normal)) { + FailedCandidateSet(Kind.getLocation(), OverloadCandidateSet::CSK_Normal) { InitializeFrom(S, Entity, Kind, Args, TopLevelOfInitList, TreatUnavailableAsInvalid); } @@ -9736,7 +9735,7 @@ bool InitializationSequence::Diagnose(Sema &S, switch (FailedOverloadResult) { case OR_Ambiguous: - FailedCandidateSet->NoteCandidates( + FailedCandidateSet.NoteCandidates( PartialDiagnosticAt( Kind.getLocation(), Failure == FK_UserConversionOverloadFailed @@ -9750,8 +9749,7 @@ bool InitializationSequence::Diagnose(Sema &S, break; case OR_No_Viable_Function: { - auto Cands = - FailedCandidateSet->CompleteCandidates(S, OCD_AllCandidates, Args); + auto Cands = FailedCandidateSet.CompleteCandidates(S, OCD_AllCandidates, Args); if (!S.RequireCompleteType(Kind.getLocation(), DestType.getNonReferenceType(), diag::err_typecheck_nonviable_condition_incomplete, @@ -9761,13 +9759,13 @@ bool InitializationSequence::Diagnose(Sema &S, << OnlyArg->getType() << Args[0]->getSourceRange() << DestType.getNonReferenceType(); - FailedCandidateSet->NoteCandidates(S, Args, Cands); + FailedCandidateSet.NoteCandidates(S, Args, Cands); break; } case OR_Deleted: { OverloadCandidateSet::iterator Best; - OverloadingResult Ovl = - FailedCandidateSet->BestViableFunction(S, Kind.getLocation(), Best); + OverloadingResult Ovl + = FailedCandidateSet.BestViableFunction(S, Kind.getLocation(), Best); StringLiteral *Msg = Best->Function->getDeletedMessage(); S.Diag(Kind.getLocation(), diag::err_typecheck_deleted_function) @@ -9951,7 +9949,7 @@ bool InitializationSequence::Diagnose(Sema &S, // bad. 
switch (FailedOverloadResult) { case OR_Ambiguous: - FailedCandidateSet->NoteCandidates( + FailedCandidateSet.NoteCandidates( PartialDiagnosticAt(Kind.getLocation(), S.PDiag(diag::err_ovl_ambiguous_init) << DestType << ArgsRange), @@ -10005,7 +10003,7 @@ bool InitializationSequence::Diagnose(Sema &S, break; } - FailedCandidateSet->NoteCandidates( + FailedCandidateSet.NoteCandidates( PartialDiagnosticAt( Kind.getLocation(), S.PDiag(diag::err_ovl_no_viable_function_in_init) @@ -10015,8 +10013,8 @@ bool InitializationSequence::Diagnose(Sema &S, case OR_Deleted: { OverloadCandidateSet::iterator Best; - OverloadingResult Ovl = - FailedCandidateSet->BestViableFunction(S, Kind.getLocation(), Best); + OverloadingResult Ovl + = FailedCandidateSet.BestViableFunction(S, Kind.getLocation(), Best); if (Ovl != OR_Deleted) { S.Diag(Kind.getLocation(), diag::err_ovl_deleted_init) << DestType << ArgsRange; @@ -10095,8 +10093,8 @@ bool InitializationSequence::Diagnose(Sema &S, S.Diag(Kind.getLocation(), diag::err_selected_explicit_constructor) << Args[0]->getSourceRange(); OverloadCandidateSet::iterator Best; - OverloadingResult Ovl = - FailedCandidateSet->BestViableFunction(S, Kind.getLocation(), Best); + OverloadingResult Ovl + = FailedCandidateSet.BestViableFunction(S, Kind.getLocation(), Best); (void)Ovl; assert(Ovl == OR_Success && "Inconsistent overload resolution"); CXXConstructorDecl *CtorDecl = cast(Best->Function); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index bcde0d86cf10fd..227ef564ba3e08 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1057,7 +1057,8 @@ bool OverloadCandidateSet::OperatorRewriteInfo::shouldAddReversed( void OverloadCandidateSet::destroyCandidates() { for (iterator i = begin(), e = end(); i != e; ++i) { - delete[] i->Conversions.data(); + for (auto &C : i->Conversions) + C.~ImplicitConversionSequence(); if (!i->Viable && i->FailureKind == ovl_fail_bad_deduction) i->DeductionFailure.Destroy(); } @@ -1065,6 +1066,8 @@ void OverloadCandidateSet::destroyCandidates() { void OverloadCandidateSet::clear(CandidateSetKind CSK) { destroyCandidates(); + SlabAllocator.Reset(); + NumInlineBytesUsed = 0; Candidates.clear(); Functions.clear(); Kind = CSK; @@ -6980,7 +6983,7 @@ void Sema::AddOverloadCandidate( Candidate.RewriteKind = CandidateSet.getRewriteInfo().getRewriteKind(Function, PO); Candidate.IsSurrogate = false; - Candidate.IsADLCandidate = static_cast(IsADLCandidate); + Candidate.IsADLCandidate = IsADLCandidate; Candidate.IgnoreObjectArgument = false; Candidate.ExplicitCallArguments = Args.size(); @@ -7812,7 +7815,7 @@ void Sema::AddTemplateOverloadCandidate( Candidate.RewriteKind = CandidateSet.getRewriteInfo().getRewriteKind(Candidate.Function, PO); Candidate.IsSurrogate = false; - Candidate.IsADLCandidate = static_cast(IsADLCandidate); + Candidate.IsADLCandidate = IsADLCandidate; // Ignore the object argument if there is one, since we don't have an object // type. 
Candidate.IgnoreObjectArgument =
@@ -14122,8 +14125,7 @@ static ExprResult FinishOverloadedCallExpr(Sema &SemaRef, Scope *S, Expr *Fn,
       return ExprError();
     return SemaRef.BuildResolvedCallExpr(
         Res.get(), FDecl, LParenLoc, Args, RParenLoc, ExecConfig,
-        /*IsExecConfig=*/false,
-        static_cast<CallExpr::ADLCallKind>((*Best)->IsADLCandidate));
+        /*IsExecConfig=*/false, (*Best)->IsADLCandidate);
   }
 
   case OR_No_Viable_Function: {
@@ -14182,8 +14184,7 @@ static ExprResult FinishOverloadedCallExpr(Sema &SemaRef, Scope *S, Expr *Fn,
       return ExprError();
     return SemaRef.BuildResolvedCallExpr(
         Res.get(), FDecl, LParenLoc, Args, RParenLoc, ExecConfig,
-        /*IsExecConfig=*/false,
-        static_cast<CallExpr::ADLCallKind>((*Best)->IsADLCandidate));
   }
   }
 
@@ -14490,8 +14491,7 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc,
   Args[0] = Input;
   CallExpr *TheCall = CXXOperatorCallExpr::Create(
       Context, Op, FnExpr.get(), ArgsArray, ResultTy, VK, OpLoc,
-      CurFPFeatureOverrides(),
-      static_cast<CallExpr::ADLCallKind>(Best->IsADLCandidate));
+      CurFPFeatureOverrides(), Best->IsADLCandidate);
 
   if (CheckCallReturnType(FnDecl->getReturnType(), OpLoc, TheCall, FnDecl))
     return ExprError();
@@ -14909,8 +14909,7 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
         // members; CodeGen should take care not to emit the this pointer.
         TheCall = CXXOperatorCallExpr::Create(
             Context, ChosenOp, FnExpr.get(), Args, ResultTy, VK, OpLoc,
-            CurFPFeatureOverrides(),
-            static_cast<CallExpr::ADLCallKind>(Best->IsADLCandidate));
+            CurFPFeatureOverrides(), Best->IsADLCandidate);
 
         if (const auto *Method = dyn_cast<CXXMethodDecl>(FnDecl);
             Method && Method->isImplicitObjectMemberFunction()) {

From 1f35e7227178843679d1d364bc5fc0bcfee2eb95 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Tue, 16 Apr 2024 17:50:16 -0700
Subject: [PATCH 194/300] [clang][builtin] Implement __builtin_allow_runtime_check
 (#87568)

RFC:
https://discourse.llvm.org/t/rfc-introduce-new-clang-builtin-builtin-allow-runtime-check/78281

---------

Co-authored-by: Noah Goldstein
Co-authored-by: Aaron Ballman
---
 clang/docs/LanguageExtensions.rst             | 48 +++++++++++++++++++
 clang/include/clang/Basic/Builtins.td         |  6 +++
 clang/lib/CodeGen/CGBuiltin.cpp               |  9 ++++
 clang/lib/Sema/SemaChecking.cpp               | 11 +++++
 .../CodeGen/builtin-allow-runtime-check.cpp   | 29 +++++++++++
 clang/test/Sema/builtin-allow-runtime-check.c | 24 ++++++++++
 6 files changed, 127 insertions(+)
 create mode 100644 clang/test/CodeGen/builtin-allow-runtime-check.cpp
 create mode 100644 clang/test/Sema/builtin-allow-runtime-check.c

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 05c8f765b55695..3bead159c8f946 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -3466,6 +3466,54 @@ Query for this feature with ``__has_builtin(__builtin_trap)``.
 ``__builtin_arm_trap`` is lowered to the ``llvm.aarch64.break`` builtin, and then to ``brk #payload``.
 
+``__builtin_allow_runtime_check``
+---------------------------------
+
+``__builtin_allow_runtime_check`` returns true if the check at the current
+program location should be executed. It is expected to be used to implement
+``assert``-like checks that can be safely removed by the optimizer.
+
+**Syntax**:
+
+.. code-block:: c++
+
+    bool __builtin_allow_runtime_check(const char* kind)
+
+**Example of use**:
+
+.. code-block:: c++
+
+  if (__builtin_allow_runtime_check("mycheck") && !ExpensiveCheck()) {
+    abort();
+  }
+
+**Description**
+
+``__builtin_allow_runtime_check`` is lowered to the
+``llvm.allow.runtime.check`` builtin.
+
+``__builtin_allow_runtime_check()`` is expected to be used in control-flow
+conditions, such as the condition of an ``if``, to guard expensive runtime
+checks. The specific rules for selecting permitted checks can differ and are
+controlled by the compiler options.
+
+Flags to control checks:
+* ``-mllvm -lower-allow-check-percentile-cutoff-hot=N`` where N is a PGO hotness
+cutoff in the range ``[0, 999999]`` to disallow checks in hot code.
+* ``-mllvm -lower-allow-check-random-rate=P`` where P is a number in the range
+``[0.0, 1.0]`` representing the probability of keeping a check.
+* If both flags are specified, ``-lower-allow-check-random-rate`` takes
+precedence.
+* If neither is specified, ``__builtin_allow_runtime_check`` is lowered as
+``true``, allowing all checks.
+
+The ``kind`` parameter is a string literal representing a user-selected kind
+for the guarded check. It is currently unused, but will enable kind-specific
+lowering in the future; e.g., a higher hotness cutoff could be used for a more
+expensive kind of check.
+
+Query for this feature with ``__has_builtin(__builtin_allow_runtime_check)``.
+
 ``__builtin_nondeterministic_value``
 ------------------------------------
 
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index d6ceb450bd106b..de721a87b3341d 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1164,6 +1164,12 @@ def Unreachable : Builtin {
   let Prototype = "void()";
 }
 
+def AllowRuntimeCheck : Builtin {
+  let Spellings = ["__builtin_allow_runtime_check"];
+  let Attributes = [NoThrow, Pure, Const];
+  let Prototype = "bool(char const*)";
+}
+
 def ShuffleVector : Builtin {
   let Spellings = ["__builtin_shufflevector"];
   let Attributes = [NoThrow, Const, CustomTypeChecking];
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 9f95697f284c40..a05874e63c73c2 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3436,6 +3436,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     Builder.CreateAssumption(ConstantInt::getTrue(getLLVMContext()), {OBD});
     return RValue::get(nullptr);
   }
+  case Builtin::BI__builtin_allow_runtime_check: {
+    StringRef Kind =
+        cast<StringLiteral>(E->getArg(0)->IgnoreParenCasts())->getString();
+    LLVMContext &Ctx = CGM.getLLVMContext();
+    llvm::Value *Allow = Builder.CreateCall(
+        CGM.getIntrinsic(llvm::Intrinsic::allow_runtime_check),
+        llvm::MetadataAsValue::get(Ctx, llvm::MDString::get(Ctx, Kind)));
+    return RValue::get(Allow);
+  }
   case Builtin::BI__arithmetic_fence: {
     // Create the builtin call if FastMath is selected, and the target
     // supports the builtin, otherwise just return the argument.
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 8e21811b67d900..99b0a00083535e 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3233,6 +3233,17 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     if (BuiltinCountZeroBitsGeneric(*this, TheCall))
       return ExprError();
     break;
+
+  case Builtin::BI__builtin_allow_runtime_check: {
+    Expr *Arg = TheCall->getArg(0);
+    // Check if the argument is a string literal.
+ if (!isa(Arg->IgnoreParenImpCasts())) { + Diag(TheCall->getBeginLoc(), diag::err_expr_not_string_literal) + << Arg->getSourceRange(); + return ExprError(); + } + break; + } } if (getLangOpts().HLSL && CheckHLSLBuiltinFunctionCall(BuiltinID, TheCall)) diff --git a/clang/test/CodeGen/builtin-allow-runtime-check.cpp b/clang/test/CodeGen/builtin-allow-runtime-check.cpp new file mode 100644 index 00000000000000..db3f59a9d48a1d --- /dev/null +++ b/clang/test/CodeGen/builtin-allow-runtime-check.cpp @@ -0,0 +1,29 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s | FileCheck %s + +static_assert(__has_builtin(__builtin_allow_runtime_check), ""); + +// CHECK-LABEL: define dso_local noundef zeroext i1 @_Z4testv( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.allow.runtime.check(metadata !"mycheck") +// CHECK-NEXT: ret i1 [[TMP0]] +// +bool test() { + return __builtin_allow_runtime_check("mycheck"); +} + +// CHECK-LABEL: define dso_local noundef zeroext i1 @_Z10test_twicev( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.allow.runtime.check(metadata !"mycheck") +// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.allow.runtime.check(metadata !"mycheck") +// CHECK-NEXT: [[CONV1:%.*]] = zext i1 [[TMP1]] to i32 +// CHECK-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[CONV1]] +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[OR]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL]] +// +bool test_twice() { + return __builtin_allow_runtime_check("mycheck") | __builtin_allow_runtime_check("mycheck"); +} diff --git a/clang/test/Sema/builtin-allow-runtime-check.c b/clang/test/Sema/builtin-allow-runtime-check.c new file mode 100644 index 00000000000000..b6568610000755 --- /dev/null +++ b/clang/test/Sema/builtin-allow-runtime-check.c @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -fsyntax-only -triple x86_64-pc-linux-gnu -verify %s +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-linux-gnu -verify %s + +extern const char *str; + +int main(void) { + int r = 0; + + r |= __builtin_allow_runtime_check(); // expected-error {{too few arguments to function call}} + + r |= __builtin_allow_runtime_check(str); // expected-error {{expression is not a string literal}} + + r |= __builtin_allow_runtime_check(5); // expected-error {{incompatible integer to pointer conversion}} expected-error {{expression is not a string literal}} + + r |= __builtin_allow_runtime_check("a", "b"); // expected-error {{too many arguments to function call}} + + r |= __builtin_allow_runtime_check(""); + + r |= __builtin_allow_runtime_check("check"); + + str = __builtin_allow_runtime_check("check2"); // expected-error {{incompatible integer to pointer conversion}} + + return r; +} From 52a4d8123c2a9157f2e543945f7b6148da3ecfdb Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 16 Apr 2024 17:51:32 -0700 Subject: [PATCH 195/300] [BOLT][NFC] Remove unused function (#89009) getFileOffsetFor() was replaced with getFileOffsetForAddress(). 
--- bolt/include/bolt/Rewrite/RewriteInstance.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index 826677cd63b22b..7a261f611eaf26 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -368,13 +368,6 @@ class RewriteInstance { /// rewritten binary. void patchBuildID(); - /// Return file offset corresponding to a given virtual address. - uint64_t getFileOffsetFor(uint64_t Address) { - assert(Address >= NewTextSegmentAddress && - "address in not in the new text segment"); - return Address - NewTextSegmentAddress + NewTextSegmentOffset; - } - /// Return file offset corresponding to a virtual \p Address. /// Return 0 if the address has no mapping in the file, including being /// part of .bss section. From 0af8caeb2fa4d68fcabe6297383d1cdf1cae8b87 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 16 Apr 2024 17:58:47 -0700 Subject: [PATCH 196/300] [BOLT][NFC] Remove another unused function (#89011) RewriteInstance::isKSymtabSection() is deprecated. --- bolt/include/bolt/Rewrite/RewriteInstance.h | 3 --- bolt/lib/Rewrite/RewriteInstance.cpp | 7 ------- 2 files changed, 10 deletions(-) diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index 7a261f611eaf26..af832b4c7c84cf 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -391,9 +391,6 @@ class RewriteInstance { /// Return true if the section holds debug information. static bool isDebugSection(StringRef SectionName); - /// Return true if the section holds linux kernel symbol information. - static bool isKSymtabSection(StringRef SectionName); - /// Adds Debug section to overwrite. static void addToDebugSectionsToOverwrite(const char *Section) { DebugSectionsToOverwrite.emplace_back(Section); diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index fd2477231142e3..4e0096cf988aed 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -5767,10 +5767,3 @@ bool RewriteInstance::isDebugSection(StringRef SectionName) { return false; } - -bool RewriteInstance::isKSymtabSection(StringRef SectionName) { - if (SectionName.starts_with("__ksymtab")) - return true; - - return false; -} From f40f4fcee506deacda0594362509ee7dddcf5e37 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Wed, 17 Apr 2024 09:57:30 +0800 Subject: [PATCH 197/300] [clang analysis] ExprMutationAnalyzer support recursive forwarding reference (#88843) Reapply for #88765. Partially fixes: #60895. 
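
A minimal sketch of the problematic pattern, mirroring the regression tests
added below (an illustration of the bug, not new functionality): two overloads
taking forwarding references call each other, so deciding whether `x` is
mutated requires analyzing f1's parameter, which requires analyzing f2's
parameter, which leads back to f1. The fix memoizes per-parameter results so
the cycle terminates:

  template <class T> void f1(T &&a);
  template <class T> void f2(T &&a);
  template <class T> void f1(T &&a) { f2(std::forward<T>(a)); }
  template <class T> void f2(T &&a) { f1(std::forward<T>(a)); }

  void f() {
    int x = 0; // with this patch, x is proven unmutated instead of
    f1(x);     // recursing without bound
  }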
--- .../UnnecessaryValueParamCheck.cpp | 10 +- .../performance/UnnecessaryValueParamCheck.h | 3 +- clang-tools-extra/docs/ReleaseNotes.rst | 4 + .../misc/const-correctness-templates.cpp | 15 ++ .../Analysis/Analyses/ExprMutationAnalyzer.h | 136 ++++++++++++------ clang/lib/Analysis/ExprMutationAnalyzer.cpp | 125 +++++++++------- .../Analysis/ExprMutationAnalyzerTest.cpp | 30 ++++ 7 files changed, 220 insertions(+), 103 deletions(-) diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp index 2fa7cd0baf98f6..c507043c367a86 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp @@ -85,10 +85,10 @@ void UnnecessaryValueParamCheck::check(const MatchFinder::MatchResult &Result) { TraversalKindScope RAII(*Result.Context, TK_AsIs); - FunctionParmMutationAnalyzer &Analyzer = - MutationAnalyzers.try_emplace(Function, *Function, *Result.Context) - .first->second; - if (Analyzer.isMutated(Param)) + FunctionParmMutationAnalyzer *Analyzer = + FunctionParmMutationAnalyzer::getFunctionParmMutationAnalyzer( + *Function, *Result.Context, MutationAnalyzerCache); + if (Analyzer->isMutated(Param)) return; const bool IsConstQualified = @@ -169,7 +169,7 @@ void UnnecessaryValueParamCheck::storeOptions( } void UnnecessaryValueParamCheck::onEndOfTranslationUnit() { - MutationAnalyzers.clear(); + MutationAnalyzerCache.clear(); } void UnnecessaryValueParamCheck::handleMoveFix(const ParmVarDecl &Var, diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h index 1872e3bc9bf29c..7250bffd20b2f9 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h @@ -37,8 +37,7 @@ class UnnecessaryValueParamCheck : public ClangTidyCheck { void handleMoveFix(const ParmVarDecl &Var, const DeclRefExpr &CopyArgument, const ASTContext &Context); - llvm::DenseMap - MutationAnalyzers; + ExprMutationAnalyzer::Memoized MutationAnalyzerCache; utils::IncludeInserter Inserter; const std::vector AllowedTypes; }; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 4dfbd8ca49ab9b..7095c564444fe6 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -221,6 +221,10 @@ Changes in existing checks ` check by replacing the local option `HeaderFileExtensions` by the global option of the same name. +- Improved :doc:`misc-const-correctness + ` check by avoiding infinite recursion + for recursive forwarding reference. + - Improved :doc:`misc-definitions-in-headers ` check by replacing the local option `HeaderFileExtensions` by the global option of the same name. diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp index 9da468128743e9..248374a71dd40b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp @@ -58,3 +58,18 @@ void concatenate3(Args... 
args) (..., (stream << args)); } } // namespace gh70323 + +namespace gh60895 { + +template void f1(T &&a); +template void f2(T &&a); +template void f1(T &&a) { f2(a); } +template void f2(T &&a) { f1(a); } +void f() { + int x = 0; + // CHECK-MESSAGES:[[@LINE-1]]:3: warning: variable 'x' of type 'int' can be declared 'const' + // CHECK-FIXES: int const x = 0; + f1(x); +} + +} // namespace gh60895 diff --git a/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h b/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h index 1ceef944fbc34e..117173ba9a0958 100644 --- a/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h +++ b/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h @@ -8,11 +8,9 @@ #ifndef LLVM_CLANG_ANALYSIS_ANALYSES_EXPRMUTATIONANALYZER_H #define LLVM_CLANG_ANALYSIS_ANALYSES_EXPRMUTATIONANALYZER_H -#include - -#include "clang/AST/AST.h" #include "clang/ASTMatchers/ASTMatchers.h" #include "llvm/ADT/DenseMap.h" +#include namespace clang { @@ -21,14 +19,74 @@ class FunctionParmMutationAnalyzer; /// Analyzes whether any mutative operations are applied to an expression within /// a given statement. class ExprMutationAnalyzer { + friend class FunctionParmMutationAnalyzer; + public: + struct Memoized { + using ResultMap = llvm::DenseMap; + using FunctionParaAnalyzerMap = + llvm::SmallDenseMap>; + + ResultMap Results; + ResultMap PointeeResults; + FunctionParaAnalyzerMap FuncParmAnalyzer; + + void clear() { + Results.clear(); + PointeeResults.clear(); + FuncParmAnalyzer.clear(); + } + }; + struct Analyzer { + Analyzer(const Stmt &Stm, ASTContext &Context, Memoized &Memorized) + : Stm(Stm), Context(Context), Memorized(Memorized) {} + + const Stmt *findMutation(const Expr *Exp); + const Stmt *findMutation(const Decl *Dec); + + const Stmt *findPointeeMutation(const Expr *Exp); + const Stmt *findPointeeMutation(const Decl *Dec); + static bool isUnevaluated(const Stmt *Smt, const Stmt &Stm, + ASTContext &Context); + + private: + using MutationFinder = const Stmt *(Analyzer::*)(const Expr *); + + const Stmt *findMutationMemoized(const Expr *Exp, + llvm::ArrayRef Finders, + Memoized::ResultMap &MemoizedResults); + const Stmt *tryEachDeclRef(const Decl *Dec, MutationFinder Finder); + + bool isUnevaluated(const Expr *Exp); + + const Stmt *findExprMutation(ArrayRef Matches); + const Stmt *findDeclMutation(ArrayRef Matches); + const Stmt * + findExprPointeeMutation(ArrayRef Matches); + const Stmt * + findDeclPointeeMutation(ArrayRef Matches); + + const Stmt *findDirectMutation(const Expr *Exp); + const Stmt *findMemberMutation(const Expr *Exp); + const Stmt *findArrayElementMutation(const Expr *Exp); + const Stmt *findCastMutation(const Expr *Exp); + const Stmt *findRangeLoopMutation(const Expr *Exp); + const Stmt *findReferenceMutation(const Expr *Exp); + const Stmt *findFunctionArgMutation(const Expr *Exp); + + const Stmt &Stm; + ASTContext &Context; + Memoized &Memorized; + }; + ExprMutationAnalyzer(const Stmt &Stm, ASTContext &Context) - : Stm(Stm), Context(Context) {} + : Memorized(), A(Stm, Context, Memorized) {} bool isMutated(const Expr *Exp) { return findMutation(Exp) != nullptr; } bool isMutated(const Decl *Dec) { return findMutation(Dec) != nullptr; } - const Stmt *findMutation(const Expr *Exp); - const Stmt *findMutation(const Decl *Dec); + const Stmt *findMutation(const Expr *Exp) { return A.findMutation(Exp); } + const Stmt *findMutation(const Decl *Dec) { return A.findMutation(Dec); } bool isPointeeMutated(const Expr *Exp) { return 
findPointeeMutation(Exp) != nullptr; @@ -36,51 +94,40 @@ class ExprMutationAnalyzer { bool isPointeeMutated(const Decl *Dec) { return findPointeeMutation(Dec) != nullptr; } - const Stmt *findPointeeMutation(const Expr *Exp); - const Stmt *findPointeeMutation(const Decl *Dec); + const Stmt *findPointeeMutation(const Expr *Exp) { + return A.findPointeeMutation(Exp); + } + const Stmt *findPointeeMutation(const Decl *Dec) { + return A.findPointeeMutation(Dec); + } + static bool isUnevaluated(const Stmt *Smt, const Stmt &Stm, - ASTContext &Context); + ASTContext &Context) { + return Analyzer::isUnevaluated(Smt, Stm, Context); + } private: - using MutationFinder = const Stmt *(ExprMutationAnalyzer::*)(const Expr *); - using ResultMap = llvm::DenseMap; - - const Stmt *findMutationMemoized(const Expr *Exp, - llvm::ArrayRef Finders, - ResultMap &MemoizedResults); - const Stmt *tryEachDeclRef(const Decl *Dec, MutationFinder Finder); - - bool isUnevaluated(const Expr *Exp); - - const Stmt *findExprMutation(ArrayRef Matches); - const Stmt *findDeclMutation(ArrayRef Matches); - const Stmt * - findExprPointeeMutation(ArrayRef Matches); - const Stmt * - findDeclPointeeMutation(ArrayRef Matches); - - const Stmt *findDirectMutation(const Expr *Exp); - const Stmt *findMemberMutation(const Expr *Exp); - const Stmt *findArrayElementMutation(const Expr *Exp); - const Stmt *findCastMutation(const Expr *Exp); - const Stmt *findRangeLoopMutation(const Expr *Exp); - const Stmt *findReferenceMutation(const Expr *Exp); - const Stmt *findFunctionArgMutation(const Expr *Exp); - - const Stmt &Stm; - ASTContext &Context; - llvm::DenseMap> - FuncParmAnalyzer; - ResultMap Results; - ResultMap PointeeResults; + Memoized Memorized; + Analyzer A; }; // A convenient wrapper around ExprMutationAnalyzer for analyzing function // params. 
class FunctionParmMutationAnalyzer { public: - FunctionParmMutationAnalyzer(const FunctionDecl &Func, ASTContext &Context); + static FunctionParmMutationAnalyzer * + getFunctionParmMutationAnalyzer(const FunctionDecl &Func, ASTContext &Context, + ExprMutationAnalyzer::Memoized &Memorized) { + auto it = Memorized.FuncParmAnalyzer.find(&Func); + if (it == Memorized.FuncParmAnalyzer.end()) + it = + Memorized.FuncParmAnalyzer + .try_emplace(&Func, std::unique_ptr( + new FunctionParmMutationAnalyzer( + Func, Context, Memorized))) + .first; + return it->getSecond().get(); + } bool isMutated(const ParmVarDecl *Parm) { return findMutation(Parm) != nullptr; @@ -88,8 +135,11 @@ class FunctionParmMutationAnalyzer { const Stmt *findMutation(const ParmVarDecl *Parm); private: - ExprMutationAnalyzer BodyAnalyzer; + ExprMutationAnalyzer::Analyzer BodyAnalyzer; llvm::DenseMap Results; + + FunctionParmMutationAnalyzer(const FunctionDecl &Func, ASTContext &Context, + ExprMutationAnalyzer::Memoized &Memorized); }; } // namespace clang diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index bb042760d297a7..941322be8f870b 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -186,9 +186,10 @@ template <> struct NodeID { static constexpr StringRef value = "decl"; }; constexpr StringRef NodeID::value; constexpr StringRef NodeID::value; -template +template const Stmt *tryEachMatch(ArrayRef Matches, - ExprMutationAnalyzer *Analyzer, F Finder) { + ExprMutationAnalyzer::Analyzer *Analyzer, F Finder) { const StringRef ID = NodeID::value; for (const auto &Nodes : Matches) { if (const Stmt *S = (Analyzer->*Finder)(Nodes.getNodeAs(ID))) @@ -199,33 +200,37 @@ const Stmt *tryEachMatch(ArrayRef Matches, } // namespace -const Stmt *ExprMutationAnalyzer::findMutation(const Expr *Exp) { - return findMutationMemoized(Exp, - {&ExprMutationAnalyzer::findDirectMutation, - &ExprMutationAnalyzer::findMemberMutation, - &ExprMutationAnalyzer::findArrayElementMutation, - &ExprMutationAnalyzer::findCastMutation, - &ExprMutationAnalyzer::findRangeLoopMutation, - &ExprMutationAnalyzer::findReferenceMutation, - &ExprMutationAnalyzer::findFunctionArgMutation}, - Results); +const Stmt *ExprMutationAnalyzer::Analyzer::findMutation(const Expr *Exp) { + return findMutationMemoized( + Exp, + {&ExprMutationAnalyzer::Analyzer::findDirectMutation, + &ExprMutationAnalyzer::Analyzer::findMemberMutation, + &ExprMutationAnalyzer::Analyzer::findArrayElementMutation, + &ExprMutationAnalyzer::Analyzer::findCastMutation, + &ExprMutationAnalyzer::Analyzer::findRangeLoopMutation, + &ExprMutationAnalyzer::Analyzer::findReferenceMutation, + &ExprMutationAnalyzer::Analyzer::findFunctionArgMutation}, + Memorized.Results); } -const Stmt *ExprMutationAnalyzer::findMutation(const Decl *Dec) { - return tryEachDeclRef(Dec, &ExprMutationAnalyzer::findMutation); +const Stmt *ExprMutationAnalyzer::Analyzer::findMutation(const Decl *Dec) { + return tryEachDeclRef(Dec, &ExprMutationAnalyzer::Analyzer::findMutation); } -const Stmt *ExprMutationAnalyzer::findPointeeMutation(const Expr *Exp) { - return findMutationMemoized(Exp, {/*TODO*/}, PointeeResults); +const Stmt * +ExprMutationAnalyzer::Analyzer::findPointeeMutation(const Expr *Exp) { + return findMutationMemoized(Exp, {/*TODO*/}, Memorized.PointeeResults); } -const Stmt *ExprMutationAnalyzer::findPointeeMutation(const Decl *Dec) { - return tryEachDeclRef(Dec, &ExprMutationAnalyzer::findPointeeMutation); +const 
Stmt * +ExprMutationAnalyzer::Analyzer::findPointeeMutation(const Decl *Dec) { + return tryEachDeclRef(Dec, + &ExprMutationAnalyzer::Analyzer::findPointeeMutation); } -const Stmt *ExprMutationAnalyzer::findMutationMemoized( +const Stmt *ExprMutationAnalyzer::Analyzer::findMutationMemoized( const Expr *Exp, llvm::ArrayRef Finders, - ResultMap &MemoizedResults) { + Memoized::ResultMap &MemoizedResults) { const auto Memoized = MemoizedResults.find(Exp); if (Memoized != MemoizedResults.end()) return Memoized->second; @@ -241,8 +246,9 @@ const Stmt *ExprMutationAnalyzer::findMutationMemoized( return MemoizedResults[Exp] = nullptr; } -const Stmt *ExprMutationAnalyzer::tryEachDeclRef(const Decl *Dec, - MutationFinder Finder) { +const Stmt * +ExprMutationAnalyzer::Analyzer::tryEachDeclRef(const Decl *Dec, + MutationFinder Finder) { const auto Refs = match( findAll( declRefExpr(to( @@ -261,8 +267,9 @@ const Stmt *ExprMutationAnalyzer::tryEachDeclRef(const Decl *Dec, return nullptr; } -bool ExprMutationAnalyzer::isUnevaluated(const Stmt *Exp, const Stmt &Stm, - ASTContext &Context) { +bool ExprMutationAnalyzer::Analyzer::isUnevaluated(const Stmt *Exp, + const Stmt &Stm, + ASTContext &Context) { return selectFirst( NodeID::value, match( @@ -293,33 +300,36 @@ bool ExprMutationAnalyzer::isUnevaluated(const Stmt *Exp, const Stmt &Stm, Stm, Context)) != nullptr; } -bool ExprMutationAnalyzer::isUnevaluated(const Expr *Exp) { +bool ExprMutationAnalyzer::Analyzer::isUnevaluated(const Expr *Exp) { return isUnevaluated(Exp, Stm, Context); } const Stmt * -ExprMutationAnalyzer::findExprMutation(ArrayRef Matches) { - return tryEachMatch(Matches, this, &ExprMutationAnalyzer::findMutation); +ExprMutationAnalyzer::Analyzer::findExprMutation(ArrayRef Matches) { + return tryEachMatch(Matches, this, + &ExprMutationAnalyzer::Analyzer::findMutation); } const Stmt * -ExprMutationAnalyzer::findDeclMutation(ArrayRef Matches) { - return tryEachMatch(Matches, this, &ExprMutationAnalyzer::findMutation); +ExprMutationAnalyzer::Analyzer::findDeclMutation(ArrayRef Matches) { + return tryEachMatch(Matches, this, + &ExprMutationAnalyzer::Analyzer::findMutation); } -const Stmt *ExprMutationAnalyzer::findExprPointeeMutation( +const Stmt *ExprMutationAnalyzer::Analyzer::findExprPointeeMutation( ArrayRef Matches) { - return tryEachMatch(Matches, this, - &ExprMutationAnalyzer::findPointeeMutation); + return tryEachMatch( + Matches, this, &ExprMutationAnalyzer::Analyzer::findPointeeMutation); } -const Stmt *ExprMutationAnalyzer::findDeclPointeeMutation( +const Stmt *ExprMutationAnalyzer::Analyzer::findDeclPointeeMutation( ArrayRef Matches) { - return tryEachMatch(Matches, this, - &ExprMutationAnalyzer::findPointeeMutation); + return tryEachMatch( + Matches, this, &ExprMutationAnalyzer::Analyzer::findPointeeMutation); } -const Stmt *ExprMutationAnalyzer::findDirectMutation(const Expr *Exp) { +const Stmt * +ExprMutationAnalyzer::Analyzer::findDirectMutation(const Expr *Exp) { // LHS of any assignment operators. const auto AsAssignmentLhs = binaryOperator(isAssignmentOperator(), hasLHS(canResolveToExpr(Exp))); @@ -426,7 +436,7 @@ const Stmt *ExprMutationAnalyzer::findDirectMutation(const Expr *Exp) { const auto AsNonConstRefReturn = returnStmt(hasReturnValue(canResolveToExpr(Exp))); - // It is used as a non-const-reference for initalizing a range-for loop. + // It is used as a non-const-reference for initializing a range-for loop. 
const auto AsNonConstRefRangeInit = cxxForRangeStmt(hasRangeInit(declRefExpr( allOf(canResolveToExpr(Exp), hasType(nonConstReferenceType()))))); @@ -443,7 +453,8 @@ const Stmt *ExprMutationAnalyzer::findDirectMutation(const Expr *Exp) { return selectFirst("stmt", Matches); } -const Stmt *ExprMutationAnalyzer::findMemberMutation(const Expr *Exp) { +const Stmt * +ExprMutationAnalyzer::Analyzer::findMemberMutation(const Expr *Exp) { // Check whether any member of 'Exp' is mutated. const auto MemberExprs = match( findAll(expr(anyOf(memberExpr(hasObjectExpression(canResolveToExpr(Exp))), @@ -456,7 +467,8 @@ const Stmt *ExprMutationAnalyzer::findMemberMutation(const Expr *Exp) { return findExprMutation(MemberExprs); } -const Stmt *ExprMutationAnalyzer::findArrayElementMutation(const Expr *Exp) { +const Stmt * +ExprMutationAnalyzer::Analyzer::findArrayElementMutation(const Expr *Exp) { // Check whether any element of an array is mutated. const auto SubscriptExprs = match( findAll(arraySubscriptExpr( @@ -469,7 +481,7 @@ const Stmt *ExprMutationAnalyzer::findArrayElementMutation(const Expr *Exp) { return findExprMutation(SubscriptExprs); } -const Stmt *ExprMutationAnalyzer::findCastMutation(const Expr *Exp) { +const Stmt *ExprMutationAnalyzer::Analyzer::findCastMutation(const Expr *Exp) { // If the 'Exp' is explicitly casted to a non-const reference type the // 'Exp' is considered to be modified. const auto ExplicitCast = @@ -504,7 +516,8 @@ const Stmt *ExprMutationAnalyzer::findCastMutation(const Expr *Exp) { return findExprMutation(Calls); } -const Stmt *ExprMutationAnalyzer::findRangeLoopMutation(const Expr *Exp) { +const Stmt * +ExprMutationAnalyzer::Analyzer::findRangeLoopMutation(const Expr *Exp) { // Keep the ordering for the specific initialization matches to happen first, // because it is cheaper to match all potential modifications of the loop // variable. @@ -567,7 +580,8 @@ const Stmt *ExprMutationAnalyzer::findRangeLoopMutation(const Expr *Exp) { return findDeclMutation(LoopVars); } -const Stmt *ExprMutationAnalyzer::findReferenceMutation(const Expr *Exp) { +const Stmt * +ExprMutationAnalyzer::Analyzer::findReferenceMutation(const Expr *Exp) { // Follow non-const reference returned by `operator*()` of move-only classes. // These are typically smart pointers with unique ownership so we treat // mutation of pointee as mutation of the smart pointer itself. 
@@ -599,7 +613,8 @@ const Stmt *ExprMutationAnalyzer::findReferenceMutation(const Expr *Exp) {
   return findDeclMutation(Refs);
 }
 
-const Stmt *ExprMutationAnalyzer::findFunctionArgMutation(const Expr *Exp) {
+const Stmt *
+ExprMutationAnalyzer::Analyzer::findFunctionArgMutation(const Expr *Exp) {
   const auto NonConstRefParam = forEachArgumentWithParam(
       canResolveToExpr(Exp),
       parmVarDecl(hasType(nonConstReferenceType())).bind("parm"));
@@ -637,10 +652,9 @@ const Stmt *ExprMutationAnalyzer::findFunctionArgMutation(const Expr *Exp) {
     if (const auto *RefType = ParmType->getAs<ReferenceType>()) {
       if (!RefType->getPointeeType().getQualifiers() &&
           RefType->getPointeeType()->getAs<TemplateTypeParmType>()) {
-        std::unique_ptr<FunctionParmMutationAnalyzer> &Analyzer =
-            FuncParmAnalyzer[Func];
-        if (!Analyzer)
-          Analyzer.reset(new FunctionParmMutationAnalyzer(*Func, Context));
+        FunctionParmMutationAnalyzer *Analyzer =
+            FunctionParmMutationAnalyzer::getFunctionParmMutationAnalyzer(
+                *Func, Context, Memorized);
         if (Analyzer->findMutation(Parm))
           return Exp;
         continue;
@@ -653,13 +667,15 @@ const Stmt *ExprMutationAnalyzer::findFunctionArgMutation(const Expr *Exp) {
 }
 
 FunctionParmMutationAnalyzer::FunctionParmMutationAnalyzer(
-    const FunctionDecl &Func, ASTContext &Context)
-    : BodyAnalyzer(*Func.getBody(), Context) {
+    const FunctionDecl &Func, ASTContext &Context,
+    ExprMutationAnalyzer::Memoized &Memorized)
+    : BodyAnalyzer(*Func.getBody(), Context, Memorized) {
   if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(&Func)) {
     // CXXCtorInitializer might also mutate Param but they're not part of
     // function body, check them eagerly here since they're typically trivial.
     for (const CXXCtorInitializer *Init : Ctor->inits()) {
-      ExprMutationAnalyzer InitAnalyzer(*Init->getInit(), Context);
+      ExprMutationAnalyzer::Analyzer InitAnalyzer(*Init->getInit(), Context,
+                                                  Memorized);
       for (const ParmVarDecl *Parm : Ctor->parameters()) {
         if (Results.contains(Parm))
           continue;
@@ -675,11 +691,14 @@ FunctionParmMutationAnalyzer::findMutation(const ParmVarDecl *Parm) {
   const auto Memoized = Results.find(Parm);
   if (Memoized != Results.end())
     return Memoized->second;
-
+  // To handle call A -> call B -> call A: assume the parameters of A are not
+  // mutated before analyzing the parameters of A. Then, when analyzing the
+  // second "call A", FunctionParmMutationAnalyzer can use this memoized value
+  // to avoid infinite recursion.
+  Results[Parm] = nullptr;
   if (const Stmt *S = BodyAnalyzer.findMutation(Parm))
     return Results[Parm] = S;
-
-  return Results[Parm] = nullptr;
+  return Results[Parm];
 }
 
 } // namespace clang
diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
index f58ce4aebcbfc8..9c1dc1a76db63d 100644
--- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
+++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp
@@ -977,6 +977,36 @@ TEST(ExprMutationAnalyzerTest, FollowFuncArgModified) {
                          "void f() { int x; g(x); }");
   Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext());
   EXPECT_THAT(mutatedBy(Results, AST.get()), ElementsAre("g(x)"));
+
+  AST = buildASTFromCode(
+      StdRemoveReference + StdForward +
+      "template <class T> void f1(T &&a);"
+      "template <class T> void f2(T &&a);"
+      "template <class T> void f1(T &&a) { f2(std::forward<T>(a)); }"
+      "template <class T> void f2(T &&a) { f1(std::forward<T>(a)); }"
+      "void f() { int x; f1(x); }");
+  Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext());
+  EXPECT_FALSE(isMutated(Results, AST.get()));
+
+  AST = buildASTFromCode(
+      StdRemoveReference + StdForward +
+      "template <class T> void f1(T &&a);"
+      "template <class T> void f2(T &&a);"
+      "template <class T> void f1(T &&a) { f2(std::forward<T>(a)); }"
+      "template <class T> void f2(T &&a) { f1(std::forward<T>(a)); a++; }"
+      "void f() { int x; f1(x); }");
+  Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext());
+  EXPECT_THAT(mutatedBy(Results, AST.get()), ElementsAre("f1(x)"));
+
+  AST = buildASTFromCode(
+      StdRemoveReference + StdForward +
+      "template <class T> void f1(T &&a);"
+      "template <class T> void f2(T &&a);"
+      "template <class T> void f1(T &&a) { f2(std::forward<T>(a)); a++; }"
+      "template <class T> void f2(T &&a) { f1(std::forward<T>(a)); }"
+      "void f() { int x; f1(x); }");
+  Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext());
+  EXPECT_THAT(mutatedBy(Results, AST.get()), ElementsAre("f1(x)"));
 }
 
 TEST(ExprMutationAnalyzerTest, FollowFuncArgNotModified) {

From 3204f3e30b8e15ce6e5d10d5d7bfbaa7cf5cd1f4 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 17 Apr 2024 10:07:57 +0800
Subject: [PATCH 198/300] [RISCV] Convert VTYPE operand check to assert in
 RISCVInsertVSETVLI. NFC

The VTYPE operands of a vsetvli pseudo are always immediates.

---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index a14f9a28354737..a54a1148cf28b9 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1522,9 +1522,7 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI,
     }
   }
 
-  if (!PrevMI.getOperand(2).isImm() || !MI.getOperand(2).isImm())
-    return false;
-
+  assert(PrevMI.getOperand(2).isImm() && MI.getOperand(2).isImm());
   auto PriorVType = PrevMI.getOperand(2).getImm();
   auto VType = MI.getOperand(2).getImm();
   return areCompatibleVTYPEs(PriorVType, VType, Used);

From c81e5faa6f55d3e390b5e550f78ab08fc6a65ee9 Mon Sep 17 00:00:00 2001
From: Brandon Wu
Date: Wed, 17 Apr 2024 10:42:40 +0800
Subject: [PATCH 199/300] [RISCV] Add CFI information for vector callee-saved
 registers (#86811)

Currently the CFI offsets for RVV registers are not fully handled;
this patch adds that information so that stack unwinding and debuggers
work correctly with RVV callee-saved stack objects.
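
For example, with this change a spilled vector CSR is annotated with an
expression-based CFI directive of the following form (taken from the test
added below; the exact escape bytes depend on the register and its offset):

  .cfi_escape 0x10, 0x61, 0x08, 0x11, 0x7e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v1 @ cfa - 2 * vlenb

That is, the location of $v1 is described by the DWARF expression
`cfa - 2 * vlenb`, which stays correct for any runtime vector length.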
Depends On D154576 Differential Revision: https://reviews.llvm.org/D156846 --- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 109 ++++++++++++++--- llvm/lib/Target/RISCV/RISCVFrameLowering.h | 3 + llvm/test/CodeGen/RISCV/rvv-cfi-info.ll | 111 ++++++++++++++++++ .../rvv/fixed-vectors-insert-subvector.ll | 2 + 4 files changed, 207 insertions(+), 18 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv-cfi-info.ll diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 71672ed7b4ae7f..cb41577c5d9435 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -435,6 +435,33 @@ void RISCVFrameLowering::adjustStackForRVV(MachineFunction &MF, Flag, getStackAlign()); } +static void appendScalableVectorExpression(const TargetRegisterInfo &TRI, + SmallVectorImpl &Expr, + int FixedOffset, int ScalableOffset, + llvm::raw_string_ostream &Comment) { + unsigned DwarfVLenB = TRI.getDwarfRegNum(RISCV::VLENB, true); + uint8_t Buffer[16]; + if (FixedOffset) { + Expr.push_back(dwarf::DW_OP_consts); + Expr.append(Buffer, Buffer + encodeSLEB128(FixedOffset, Buffer)); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + Comment << (FixedOffset < 0 ? " - " : " + ") << std::abs(FixedOffset); + } + + Expr.push_back((uint8_t)dwarf::DW_OP_consts); + Expr.append(Buffer, Buffer + encodeSLEB128(ScalableOffset, Buffer)); + + Expr.push_back((uint8_t)dwarf::DW_OP_bregx); + Expr.append(Buffer, Buffer + encodeULEB128(DwarfVLenB, Buffer)); + Expr.push_back(0); + + Expr.push_back((uint8_t)dwarf::DW_OP_mul); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + + Comment << (ScalableOffset < 0 ? " - " : " + ") << std::abs(ScalableOffset) + << " * vlenb"; +} + static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, Register Reg, uint64_t FixedOffset, @@ -452,30 +479,38 @@ static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, else Comment << printReg(Reg, &TRI); - uint8_t buffer[16]; - if (FixedOffset) { - Expr.push_back(dwarf::DW_OP_consts); - Expr.append(buffer, buffer + encodeSLEB128(FixedOffset, buffer)); - Expr.push_back((uint8_t)dwarf::DW_OP_plus); - Comment << " + " << FixedOffset; - } + appendScalableVectorExpression(TRI, Expr, FixedOffset, ScalableOffset, + Comment); - Expr.push_back((uint8_t)dwarf::DW_OP_consts); - Expr.append(buffer, buffer + encodeSLEB128(ScalableOffset, buffer)); + SmallString<64> DefCfaExpr; + uint8_t Buffer[16]; + DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); + DefCfaExpr.append(Buffer, Buffer + encodeULEB128(Expr.size(), Buffer)); + DefCfaExpr.append(Expr.str()); - unsigned DwarfVlenb = TRI.getDwarfRegNum(RISCV::VLENB, true); - Expr.push_back((uint8_t)dwarf::DW_OP_bregx); - Expr.append(buffer, buffer + encodeULEB128(DwarfVlenb, buffer)); - Expr.push_back(0); + return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(), + Comment.str()); +} - Expr.push_back((uint8_t)dwarf::DW_OP_mul); - Expr.push_back((uint8_t)dwarf::DW_OP_plus); +static MCCFIInstruction createDefCFAOffset(const TargetRegisterInfo &TRI, + Register Reg, uint64_t FixedOffset, + uint64_t ScalableOffset) { + assert(ScalableOffset != 0 && "Did not need to adjust CFA for RVV"); + SmallString<64> Expr; + std::string CommentBuffer; + llvm::raw_string_ostream Comment(CommentBuffer); + Comment << printReg(Reg, &TRI) << " @ cfa"; - Comment << " + " << ScalableOffset << " * vlenb"; + // Build up the expression (FixedOffset + ScalableOffset * VLENB). 
+ appendScalableVectorExpression(TRI, Expr, FixedOffset, ScalableOffset, + Comment); SmallString<64> DefCfaExpr; - DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); - DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); + uint8_t Buffer[16]; + unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + DefCfaExpr.push_back(dwarf::DW_CFA_expression); + DefCfaExpr.append(Buffer, Buffer + encodeULEB128(DwarfReg, Buffer)); + DefCfaExpr.append(Buffer, Buffer + encodeULEB128(Expr.size(), Buffer)); DefCfaExpr.append(Expr.str()); return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(), @@ -671,6 +706,9 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, .addCFIIndex(CFIIndex) .setMIFlag(MachineInstr::FrameSetup); } + + std::advance(MBBI, getRVVCalleeSavedInfo(MF, CSI).size()); + emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF)); } if (hasFP(MF)) { @@ -1492,6 +1530,41 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( return true; } +void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, bool HasFP) const { + MachineFunction *MF = MBB.getParent(); + const MachineFrameInfo &MFI = MF->getFrameInfo(); + RISCVMachineFunctionInfo *RVFI = MF->getInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MI); + + const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, MFI.getCalleeSavedInfo()); + if (RVVCSI.empty()) + return; + + uint64_t FixedSize = getStackSizeWithRVVPadding(*MF); + if (!HasFP) { + uint64_t ScalarLocalVarSize = + MFI.getStackSize() - RVFI->getCalleeSavedStackSize() - + RVFI->getRVPushStackSize() - RVFI->getVarArgsSaveSize() + + RVFI->getRVVPadding(); + FixedSize -= ScalarLocalVarSize; + } + + for (auto &CS : RVVCSI) { + // Insert the spill to the stack frame. 
+ int FI = CS.getFrameIdx(); + if (FI >= 0 && MFI.getStackID(FI) == TargetStackID::ScalableVector) { + unsigned CFIIndex = MF->addFrameInst( + createDefCFAOffset(*STI.getRegisterInfo(), CS.getReg(), -FixedSize, + MFI.getObjectOffset(FI) / 8)); + BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + } + } +} + bool RISCVFrameLowering::restoreCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, MutableArrayRef CSI, const TargetRegisterInfo *TRI) const { diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index 210f8c1064724a..28ab4aff3b9d51 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -88,6 +88,9 @@ class RISCVFrameLowering : public TargetFrameLowering { void adjustStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount, MachineInstr::MIFlag Flag) const; + void emitCalleeSavedRVVPrologCFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + bool HasFP) const; std::pair assignRVVStackObjectOffsets(MachineFunction &MF) const; }; diff --git a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll new file mode 100644 index 00000000000000..c99388cbdaf441 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=OMIT-FP %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs -frame-pointer=all < %s \ +; RUN: | FileCheck -check-prefix=NO-OMIT-FP %s + +define riscv_vector_cc @test_vector_callee_cfi( %va) { +; OMIT-FP-LABEL: test_vector_callee_cfi: +; OMIT-FP: # %bb.0: # %entry +; OMIT-FP-NEXT: addi sp, sp, -16 +; OMIT-FP-NEXT: .cfi_def_cfa_offset 16 +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: slli a0, a0, 3 +; OMIT-FP-NEXT: sub sp, sp, a0 +; OMIT-FP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: li a1, 6 +; OMIT-FP-NEXT: mul a0, a0, a1 +; OMIT-FP-NEXT: add a0, sp, a0 +; OMIT-FP-NEXT: addi a0, a0, 16 +; OMIT-FP-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: slli a0, a0, 2 +; OMIT-FP-NEXT: add a0, sp, a0 +; OMIT-FP-NEXT: addi a0, a0, 16 +; OMIT-FP-NEXT: vs2r.v v2, (a0) # Unknown-size Folded Spill +; OMIT-FP-NEXT: addi a0, sp, 16 +; OMIT-FP-NEXT: vs4r.v v4, (a0) # Unknown-size Folded Spill +; OMIT-FP-NEXT: .cfi_escape 0x10, 0x61, 0x08, 0x11, 0x7e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v1 @ cfa - 2 * vlenb +; OMIT-FP-NEXT: .cfi_escape 0x10, 0x62, 0x08, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2m2 @ cfa - 4 * vlenb +; OMIT-FP-NEXT: .cfi_escape 0x10, 0x64, 0x08, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4m4 @ cfa - 8 * vlenb +; OMIT-FP-NEXT: #APP +; OMIT-FP-NEXT: #NO_APP +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: li a1, 6 +; OMIT-FP-NEXT: mul a0, a0, a1 +; OMIT-FP-NEXT: add a0, sp, a0 +; OMIT-FP-NEXT: addi a0, a0, 16 +; OMIT-FP-NEXT: vl1r.v v1, (a0) # Unknown-size Folded Reload +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: slli a0, a0, 2 +; OMIT-FP-NEXT: add a0, sp, a0 +; OMIT-FP-NEXT: addi a0, a0, 16 +; OMIT-FP-NEXT: vl2r.v v2, (a0) # Unknown-size Folded Reload +; 
OMIT-FP-NEXT: addi a0, sp, 16 +; OMIT-FP-NEXT: vl4r.v v4, (a0) # Unknown-size Folded Reload +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: slli a0, a0, 3 +; OMIT-FP-NEXT: add sp, sp, a0 +; OMIT-FP-NEXT: addi sp, sp, 16 +; OMIT-FP-NEXT: ret +; +; NO-OMIT-FP-LABEL: test_vector_callee_cfi: +; NO-OMIT-FP: # %bb.0: # %entry +; NO-OMIT-FP-NEXT: addi sp, sp, -32 +; NO-OMIT-FP-NEXT: .cfi_def_cfa_offset 32 +; NO-OMIT-FP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; NO-OMIT-FP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; NO-OMIT-FP-NEXT: .cfi_offset ra, -8 +; NO-OMIT-FP-NEXT: .cfi_offset s0, -16 +; NO-OMIT-FP-NEXT: addi s0, sp, 32 +; NO-OMIT-FP-NEXT: .cfi_def_cfa s0, 0 +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 3 +; NO-OMIT-FP-NEXT: sub sp, sp, a0 +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 1 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 2 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vs2r.v v2, (a0) # Unknown-size Folded Spill +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 3 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vs4r.v v4, (a0) # Unknown-size Folded Spill +; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x61, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v1 @ cfa - 32 - 2 * vlenb +; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x62, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2m2 @ cfa - 32 - 4 * vlenb +; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x64, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4m4 @ cfa - 32 - 8 * vlenb +; NO-OMIT-FP-NEXT: #APP +; NO-OMIT-FP-NEXT: #NO_APP +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 1 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vl1r.v v1, (a0) # Unknown-size Folded Reload +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 2 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vl2r.v v2, (a0) # Unknown-size Folded Reload +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 3 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vl4r.v v4, (a0) # Unknown-size Folded Reload +; NO-OMIT-FP-NEXT: addi sp, s0, -32 +; NO-OMIT-FP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; NO-OMIT-FP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; NO-OMIT-FP-NEXT: addi sp, sp, 32 +; NO-OMIT-FP-NEXT: ret +entry: + call void asm sideeffect "", + "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() + + ret %va +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll index 1d3c22a02efc0f..ab6df1d3e883fd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll @@ -628,6 +628,7 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; RV32-NEXT: vs8r.v v8, (a0) ; RV32-NEXT: vs8r.v v16, (a1) ; RV32-NEXT: addi sp, s0, -80 +; RV32-NEXT: .cfi_def_cfa sp, 80 ; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 80 @@ -661,6 +662,7 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; 
RV64-NEXT: vs8r.v v8, (a0)
; RV64-NEXT: vs8r.v v16, (a1)
; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: .cfi_def_cfa sp, 80
; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80

From e6ecff8d95b9175e70e0d43e14c2975c8f69d718 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Wed, 17 Apr 2024 10:40:09 +0800
Subject: [PATCH 200/300] [C++20] [Modules] Add Release Notes and Documents for
 Reduced BMI

See
https://discourse.llvm.org/t/rfc-c-20-modules-introduce-thin-bmi-and-decls-hash/74755,
https://github.com/llvm/llvm-project/pull/75894 and
https://github.com/llvm/llvm-project/pull/85050 for the background.

---
 clang/docs/ReleaseNotes.rst             |   3 +
 clang/docs/StandardCPlusPlusModules.rst | 106 ++++++++++++++++++++++++
 2 files changed, 109 insertions(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3752b6ce157600..efc32212f300cf 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -214,6 +214,9 @@ New Compiler Flags
   This diagnostic can be disabled to make ``-Wmissing-field-initializers`` behave
   like it did before Clang 18.x. Fixes #GH56628

+- ``-fexperimental-modules-reduced-bmi`` enables the Reduced BMI for C++20 named modules.
+  See the standard C++ modules documentation for details.
+
 Deprecated Compiler Flags
 -------------------------
 
diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst
index c5478bba45f389..8d5529d5d37db5 100644
--- a/clang/docs/StandardCPlusPlusModules.rst
+++ b/clang/docs/StandardCPlusPlusModules.rst
@@ -520,6 +520,112 @@ is attached to the global module fragments. For example:
 
 Now the linkage name of ``NS::foo()`` will be ``_ZN2NS3fooEv``.
 
+Reduced BMI
+-----------
+
+To support the two-phase compilation model, Clang chose to put everything
+needed to produce an object into the BMI. But no consumer of the BMI, other
+than the compilation of the module unit itself, needs all of that information.
+This makes the BMI larger than necessary and may introduce unnecessary
+dependencies into it. To mitigate the problem, we decided to reduce the
+information contained in the BMI.
+
+To be clear, we call the default BMI the Full BMI and the newly introduced
+BMI the Reduced BMI.
+
+Users can pass the ``-fexperimental-modules-reduced-bmi`` flag to enable the
+Reduced BMI.
+
+For the one-phase compilation model (the model CMake implements), with
+``-fexperimental-modules-reduced-bmi`` the generated BMI will automatically be
+a Reduced BMI. (The output path of the BMI is specified by
+``-fmodule-output=``, as usual in the one-phase compilation model.)
+
+It is still possible to support the Reduced BMI in the two-phase compilation
+model. With ``-fexperimental-modules-reduced-bmi``, ``--precompile``, and
+``-fmodule-output=`` specified, the BMI specified by ``-o`` will be a Full BMI
+and the BMI specified by ``-fmodule-output=`` will be a Reduced BMI. The
+dependency graph may be:
+
+.. code-block:: none
+
+  module-unit.cppm --> module-unit.full.pcm -> module-unit.o
+                    |
+                    -> module-unit.reduced.pcm -> consumer1.cpp
+                                               -> consumer2.cpp
+                                               -> ...
+                                               -> consumer_n.cpp
+
+We don't emit a diagnostic if ``-fexperimental-modules-reduced-bmi`` is used
+with a non-module unit. This design lets end users of the one-phase
+compilation model experiment early without help from build systems. Users of
+build systems that support the two-phase compilation model still need support
+from their build systems.
+
+Within a Reduced BMI, we don't write unreachable entities from the GMF, or the
+definitions of non-inline functions and non-inline variables. This may not be
+a transparent change. Example 2 in ``[module.global.frag]`` is a good
+illustration:
+
+.. code-block:: c++
+
+  // foo.h
+  namespace N {
+    struct X {};
+    int d();
+    int e();
+    inline int f(X, int = d()) { return e(); }
+    int g(X);
+    int h(X);
+  }
+
+  // M.cppm
+  module;
+  #include "foo.h"
+  export module M;
+  template <typename T> int use_f() {
+    N::X x;             // N::X, N, and :: are decl-reachable from use_f
+    return f(x, 123);   // N::f is decl-reachable from use_f,
+                        // N::e is indirectly decl-reachable from use_f
+                        //   because it is decl-reachable from N::f, and
+                        // N::d is decl-reachable from use_f
+                        //   because it is decl-reachable from N::f
+                        //   even though it is not used in this call
+  }
+  template <typename T> int use_g() {
+    N::X x;             // N::X, N, and :: are decl-reachable from use_g
+    return g((T(), x)); // N::g is not decl-reachable from use_g
+  }
+  template <typename T> int use_h() {
+    N::X x;             // N::X, N, and :: are decl-reachable from use_h
+    return h((T(), x)); // N::h is not decl-reachable from use_h, but
+                        // N::h is decl-reachable from use_h<int>
+  }
+  int k = use_h<int>();
+    // use_h<int> is decl-reachable from k, so
+    // N::h is decl-reachable from k
+
+  // M-impl.cpp
+  module M;
+  int a = use_f<int>(); // OK
+  int b = use_g<int>(); // error: no viable function for call to g;
+                        // g is not decl-reachable from purview of
+                        // module M's interface, so is discarded
+  int c = use_h<int>(); // OK
+
+In the above example, the function definition of ``N::g`` is elided from the
+Reduced BMI of ``M.cppm``, so the use of ``use_g<int>`` in ``M-impl.cpp``
+fails to instantiate. For such issues, users can add references to ``N::g``
+in the module purview of ``M.cppm`` to make sure it is reachable, e.g.,
+``using N::g;``.
+
+We think the Reduced BMI is the correct direction, but given that it is a
+drastic change, we'd like to make it experimental first to avoid breaking
+existing users. The roadmap for the Reduced BMI may be:
+
+1. ``-fexperimental-modules-reduced-bmi`` is opt-in for 1~2 releases. The
+   period depends on testing feedback.
+2. We will announce that the Reduced BMI is no longer experimental, introduce
+   ``-fmodules-reduced-bmi``, and suggest users enable this mode. This may
+   take another 1~2 releases.
+3. Finally, we will enable this by default. When that time comes, the term BMI
+   will refer to what is the Reduced BMI today, and the Full BMI will only be
+   meaningful to build systems that choose to support two-phase compilation.
+
 Performance Tips
 ----------------
 
From eafd515ecaaa100623eebc7fa4d7c36a361bf708 Mon Sep 17 00:00:00 2001
From: Jan Svoboda
Date: Tue, 16 Apr 2024 19:47:52 -0700
Subject: [PATCH 201/300] [clang][deps] Support single-file mode for all
 formats (#88764)

The `clang-scan-deps` tool can be used for fast scanning of batches of
compilation commands passed in via the `-compilation-database` option.
This gets awkward in our tests, where we have to resort to using
`.in`/`.template` JSON files and running them through `sed` in order to
embed LIT's `%t` variable into them. However, most of our tests only
need to pass a single compilation command, so this dance is entirely
unnecessary.

This patch makes sure the existing "per-file" mode (where the
compilation command is passed in-line after the `--` argument) works
for all output formats, not only `P1689`.
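
For example, after this patch a lit test can invoke the scanner directly on a
single command (an illustrative invocation with hypothetical paths, in the
spirit of the updated `error.cpp` test below):

  clang-scan-deps -- clang -c %t/missing_tu.c

instead of templating a JSON compilation database through `sed`.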
--- clang/test/ClangScanDeps/error.cpp | 18 +----- clang/tools/clang-scan-deps/ClangScanDeps.cpp | 57 +++++++------------ 2 files changed, 23 insertions(+), 52 deletions(-) diff --git a/clang/test/ClangScanDeps/error.cpp b/clang/test/ClangScanDeps/error.cpp index 0095a6c900c3b3..593dbf35edca52 100644 --- a/clang/test/ClangScanDeps/error.cpp +++ b/clang/test/ClangScanDeps/error.cpp @@ -1,23 +1,10 @@ // RUN: rm -rf %t // RUN: split-file %s %t -//--- missing_tu.json.in -[{ - "directory": "DIR", - "command": "clang -fsyntax-only DIR/missing_tu.c", - "file": "DIR/missing_tu.c" -}] -//--- missing_header.json.in -[{ - "directory": "DIR", - "command": "clang -fsyntax-only DIR/missing_header.c", - "file": "DIR/missing_header.c" -}] //--- missing_header.c #include "missing.h" -// RUN: sed -e "s|DIR|%/t|g" %t/missing_tu.json.in > %t/missing_tu.json -// RUN: not clang-scan-deps -compilation-database %t/missing_tu.json 2>%t/missing_tu.errs +// RUN: not clang-scan-deps -- %clang -c %t/missing_tu.c 2>%t/missing_tu.errs // RUN: echo EOF >> %t/missing_tu.errs // RUN: cat %t/missing_tu.errs | sed 's:\\\\\?:/:g' | FileCheck %s --check-prefix=CHECK-MISSING-TU -DPREFIX=%/t // CHECK-MISSING-TU: Error while scanning dependencies for [[PREFIX]]/missing_tu.c @@ -26,8 +13,7 @@ // CHECK-MISSING-TU-NEXT: error: // CHECK-MISSING-TU-NEXT: EOF -// RUN: sed -e "s|DIR|%/t|g" %t/missing_header.json.in > %t/missing_header.json -// RUN: not clang-scan-deps -compilation-database %t/missing_header.json 2>%t/missing_header.errs +// RUN: not clang-scan-deps -- %clang -c %t/missing_header.c 2>%t/missing_header.errs // RUN: echo EOF >> %t/missing_header.errs // RUN: cat %t/missing_header.errs | sed 's:\\\\\?:/:g' | FileCheck %s --check-prefix=CHECK-MISSING-HEADER -DPREFIX=%/t // CHECK-MISSING-HEADER: Error while scanning dependencies for [[PREFIX]]/missing_header.c diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index eaa76dd43e41dd..94510515cd4403 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -98,8 +98,8 @@ static bool RoundTripArgs = DoRoundTripDefault; static void ParseArgs(int argc, char **argv) { ScanDepsOptTable Tbl; llvm::StringRef ToolName = argv[0]; - llvm::BumpPtrAllocator A; - llvm::StringSaver Saver{A}; + llvm::BumpPtrAllocator Alloc; + llvm::StringSaver Saver{Alloc}; llvm::opt::InputArgList Args = Tbl.parseArgs(argc, argv, OPT_UNKNOWN, Saver, [&](StringRef Msg) { llvm::errs() << Msg << '\n'; @@ -186,14 +186,8 @@ static void ParseArgs(int argc, char **argv) { } } - if (const llvm::opt::Arg *A = Args.getLastArg(OPT_compilation_database_EQ)) { + if (const llvm::opt::Arg *A = Args.getLastArg(OPT_compilation_database_EQ)) CompilationDB = A->getValue(); - } else if (Format != ScanningOutputFormat::P1689) { - llvm::errs() << ToolName - << ": for the --compiilation-database option: must be " - "specified at least once!"; - std::exit(1); - } if (const llvm::opt::Arg *A = Args.getLastArg(OPT_module_name_EQ)) ModuleName = A->getValue(); @@ -225,9 +219,8 @@ static void ParseArgs(int argc, char **argv) { RoundTripArgs = Args.hasArg(OPT_round_trip_args); - if (auto *A = Args.getLastArgNoClaim(OPT_DASH_DASH)) - CommandLine.insert(CommandLine.end(), A->getValues().begin(), - A->getValues().end()); + if (const llvm::opt::Arg *A = Args.getLastArgNoClaim(OPT_DASH_DASH)) + CommandLine.assign(A->getValues().begin(), A->getValues().end()); } class SharedStream { @@ -694,38 +687,28 @@ static std::string 
getModuleCachePath(ArrayRef<std::string> Args) {
   return std::string(Path);
 }

-// getCompilationDataBase - If -compilation-database is set, load the
-// compilation database from the specified file. Otherwise if the we're
-// generating P1689 format, trying to generate the compilation database
-// form specified command line after the positional parameter "--".
+/// Attempts to construct the compilation database from '-compilation-database'
+/// or from the arguments following the positional '--'.
 static std::unique_ptr<tooling::CompilationDatabase>
-getCompilationDataBase(int argc, char **argv, std::string &ErrorMessage) {
+getCompilationDatabase(int argc, char **argv, std::string &ErrorMessage) {
   ParseArgs(argc, argv);

+  if (!(CommandLine.empty() ^ CompilationDB.empty())) {
+    llvm::errs() << "The compilation command line must be provided either via "
+                    "'-compilation-database' or after '--'.";
+    return nullptr;
+  }
+
   if (!CompilationDB.empty())
     return tooling::JSONCompilationDatabase::loadFromFile(
         CompilationDB, ErrorMessage,
         tooling::JSONCommandLineSyntax::AutoDetect);

-  if (Format != ScanningOutputFormat::P1689) {
-    llvm::errs() << "the --compilation-database option: must be specified at "
-                    "least once!";
-    return nullptr;
-  }
-
-  // Trying to get the input file, the output file and the command line options
-  // from the positional parameter "--".
-  char **DoubleDash = std::find(argv, argv + argc, StringRef("--"));
-  if (DoubleDash == argv + argc) {
-    llvm::errs() << "The command line arguments is required after '--' in "
-                    "P1689 per file mode.";
-    return nullptr;
-  }
-
   llvm::IntrusiveRefCntPtr<DiagnosticsEngine> Diags =
       CompilerInstance::createDiagnostics(new DiagnosticOptions);
   driver::Driver TheDriver(CommandLine[0], llvm::sys::getDefaultTargetTriple(),
                            *Diags);
+  TheDriver.setCheckInputsExist(false);
   std::unique_ptr<driver::Compilation> C(
       TheDriver.BuildCompilation(CommandLine));
   if (!C || C->getJobs().empty())
@@ -740,7 +723,8 @@ getCompilationDataBase(int argc, char **argv, std::string &ErrorMessage) {
   FrontendOptions &FEOpts = CI->getFrontendOpts();
   if (FEOpts.Inputs.size() != 1) {
-    llvm::errs() << "Only one input file is allowed in P1689 per file mode.";
+    llvm::errs()
+        << "Exactly one input file is required in the per-file mode ('--').\n";
     return nullptr;
   }

@@ -749,8 +733,9 @@ getCompilationDataBase(int argc, char **argv, std::string &ErrorMessage) {
   auto LastCmd = C->getJobs().end();
   LastCmd--;
   if (LastCmd->getOutputFilenames().size() != 1) {
-    llvm::errs() << "The command line should provide exactly one output file "
-                    "in P1689 per file mode.\n";
+    llvm::errs()
+        << "Exactly one output file is required in the per-file mode ('--').\n";
+    return nullptr;
   }
   StringRef OutputFile = LastCmd->getOutputFilenames().front();
@@ -790,7 +775,7 @@ getCompilationDataBase(int argc, char **argv, std::string &ErrorMessage) {
 int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
   std::string ErrorMessage;
   std::unique_ptr<tooling::CompilationDatabase> Compilations =
-      getCompilationDataBase(argc, argv, ErrorMessage);
+      getCompilationDatabase(argc, argv, ErrorMessage);
   if (!Compilations) {
     llvm::errs() << ErrorMessage << "\n";
     return 1;

From 6a4eaf9b33d8091b7d09b2a30a3fc8993a01db31 Mon Sep 17 00:00:00 2001
From: Jan Svoboda
Date: Tue, 16 Apr 2024 19:49:07 -0700
Subject: [PATCH 202/300] [clang][deps] Add `-o` flag to specify output path
 (#88767)

This makes it possible to pass "-o /dev/null" to `clang-scan-deps` and skip
some potentially expensive work, making timings less noisy. Also removes the
need for stream redirection.
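An illustrative invocation (the database file name is made up):

  clang-scan-deps -compilation-database compile_commands.json \
      -format experimental-full -o /dev/null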
--- clang/test/ClangScanDeps/module-format.c | 2 +- clang/tools/clang-scan-deps/ClangScanDeps.cpp | 34 ++++++++++++++++--- clang/tools/clang-scan-deps/Opts.td | 4 ++- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/clang/test/ClangScanDeps/module-format.c b/clang/test/ClangScanDeps/module-format.c index 001a011ae0b597..0a6abec80dd909 100644 --- a/clang/test/ClangScanDeps/module-format.c +++ b/clang/test/ClangScanDeps/module-format.c @@ -16,7 +16,7 @@ // RUN: rm -f %t/cdb_pch.json // RUN: sed "s|DIR|%/t|g" %S/Inputs/modules-pch/cdb_pch.json > %t/cdb_pch.json // RUN: clang-scan-deps -compilation-database %t/cdb_pch.json -format experimental-full \ -// RUN: -module-files-dir %t/build > %t/result_pch.json +// RUN: -module-files-dir %t/build -o %t/result_pch.json // Explicitly build the PCH: // diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 94510515cd4403..f42af7e330e17a 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -72,6 +72,7 @@ enum ResourceDirRecipeKind { RDRK_InvokeCompiler, }; +static std::string OutputFileName = "-"; static ScanningMode ScanMode = ScanningMode::DependencyDirectivesScan; static ScanningOutputFormat Format = ScanningOutputFormat::Make; static ScanningOptimizations OptimizeArgs; @@ -175,6 +176,9 @@ static void ParseArgs(int argc, char **argv) { if (const llvm::opt::Arg *A = Args.getLastArg(OPT_module_files_dir_EQ)) ModuleFilesDir = A->getValue(); + if (const llvm::opt::Arg *A = Args.getLastArg(OPT_o)) + OutputFileName = A->getValue(); + EagerLoadModules = Args.hasArg(OPT_eager_load_pcm); if (const llvm::opt::Arg *A = Args.getLastArg(OPT_j)) { @@ -419,6 +423,11 @@ class FullDeps { } void printFullOutput(raw_ostream &OS) { + // Skip sorting modules and constructing the JSON object if the output + // cannot be observed anyway. This makes timings less noisy. + if (&OS == &llvm::nulls()) + return; + // Sort the modules by name to get a deterministic order. std::vector ModuleIDs; for (auto &&M : Modules) @@ -849,8 +858,25 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { }); SharedStream Errs(llvm::errs()); - // Print out the dependency results to STDOUT by default. 
-  SharedStream DependencyOS(llvm::outs());
+
+  std::optional<llvm::raw_fd_ostream> FileOS;
+  llvm::raw_ostream &ThreadUnsafeDependencyOS = [&]() -> llvm::raw_ostream & {
+    if (OutputFileName == "-")
+      return llvm::outs();
+
+    if (OutputFileName == "/dev/null")
+      return llvm::nulls();
+
+    std::error_code EC;
+    FileOS.emplace(OutputFileName, EC);
+    if (EC) {
+      llvm::errs() << "Failed to open output file '" << OutputFileName
+                   << "': " << llvm::errorCodeToError(EC) << '\n';
+      std::exit(1);
+    }
+    return *FileOS;
+  }();
+  SharedStream DependencyOS(ThreadUnsafeDependencyOS);

   std::vector<tooling::CompileCommand> Inputs =
       AdjustingCompilations->getAllCompileCommands();
@@ -991,9 +1017,9 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
       HadErrors = true;

   if (Format == ScanningOutputFormat::Full)
-    FD->printFullOutput(llvm::outs());
+    FD->printFullOutput(ThreadUnsafeDependencyOS);
   else if (Format == ScanningOutputFormat::P1689)
-    PD.printDependencies(llvm::outs());
+    PD.printDependencies(ThreadUnsafeDependencyOS);

   return HadErrors;
 }
diff --git a/clang/tools/clang-scan-deps/Opts.td b/clang/tools/clang-scan-deps/Opts.td
index 5cd5d1a9fb37bc..4837ce6f070d73 100644
--- a/clang/tools/clang-scan-deps/Opts.td
+++ b/clang/tools/clang-scan-deps/Opts.td
@@ -11,6 +11,8 @@ multiclass Eq<string name, string help> {
 def help : Flag<["--"], "help">, HelpText<"Display this help">;
 def version : Flag<["--"], "version">, HelpText<"Display the version">;

+def o : Arg<"o", "Destination of the primary output">;
+
 defm mode : Eq<"mode", "The preprocessing mode used to compute the dependencies">;
 defm format : Eq<"format", "The output format for the dependencies">;
@@ -37,4 +39,4 @@ def verbose : F<"v", "Use verbose output">;
 def round_trip_args : F<"round-trip-args",
   "verify that command-line arguments are canonical by parsing and re-serializing">;

-def DASH_DASH : Option<["--"], "", KIND_REMAINING_ARGS>;
\ No newline at end of file
+def DASH_DASH : Option<["--"], "", KIND_REMAINING_ARGS>;

From f71e25bb669d662f98823d6d81b3f918538c9239 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Tue, 16 Apr 2024 20:35:34 -0700
Subject: [PATCH 203/300] [memprof] Simplify IndexedMemProfRecord::operator==
 (NFC) (#88986)

llvm::SmallVector::operator== exactly meets our needs.

---
 llvm/include/llvm/ProfileData/MemProf.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index d43fb1c93bb8ef..7f3956bd739390 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -370,14 +370,9 @@ struct IndexedMemProfRecord {
   size_t serializedSize(IndexedVersion Version) const;

   bool operator==(const IndexedMemProfRecord &Other) const {
-    if (Other.AllocSites.size() != AllocSites.size())
+    if (Other.AllocSites != AllocSites)
       return false;

-    for (size_t I = 0; I < AllocSites.size(); I++) {
-      if (AllocSites[I] != Other.AllocSites[I])
-        return false;
-    }
-
     if (Other.CallSiteIds != CallSiteIds)
       return false;
     return true;

From fca2a493251597967d5d758ea0748c66dd29371a Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 16 Apr 2024 21:46:57 -0700
Subject: [PATCH 204/300] [RISCV] Simplify FindRegWithEncoding in
 copyPhysRegVector. NFC (#89001)

Instead of searching all encodings, we can convert the encoding back to a
register and use getMatchingSuperReg.
--- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 14b5cbea71722f..8331fc0b8c3024 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -361,15 +361,12 @@ void RISCVInstrInfo::copyPhysRegVector( return {RISCVII::LMUL_1, RISCV::VRRegClass, RISCV::VMV1R_V, RISCV::PseudoVMV_V_V_M1, RISCV::PseudoVMV_V_I_M1}; }; - auto FindRegWithEncoding = [&TRI](const TargetRegisterClass &RegClass, - uint16_t Encoding) { - ArrayRef Regs = RegClass.getRegisters(); - const auto *FoundReg = llvm::find_if(Regs, [&](MCPhysReg Reg) { - return TRI->getEncodingValue(Reg) == Encoding; - }); - // We should be always able to find one valid register. - assert(FoundReg != Regs.end()); - return *FoundReg; + auto FindRegWithEncoding = [TRI](const TargetRegisterClass &RegClass, + uint16_t Encoding) { + MCRegister Reg = RISCV::V0 + Encoding; + if (&RegClass == &RISCV::VRRegClass) + return Reg; + return TRI->getMatchingSuperReg(Reg, RISCV::sub_vrm1_0, &RegClass); }; while (I != NumRegs) { // For non-segment copying, we only do this once as the registers are always From a6fcbcce8f79adfb2e4338859f3a41fc2538bad1 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Wed, 17 Apr 2024 07:59:43 +0200 Subject: [PATCH 205/300] [libc++][TZDB] Improves time zone format specifiers. (#85797) Per [tab:time.format.spec] %z The offset from UTC as specified in ISO 8601-1:2019, subclause 5.3.4.1. For example -0430 refers to 4 hours 30 minutes behind UTC. If the offset is zero, +0000 is used. The modified commands %Ez and %Oz insert a : between the hours and minutes: -04:30. If the offset information is not available, an exception of type format_error is thrown. Typically the modified versions Oz or Ez would have wording like The modified command %OS produces the locale's alternative representation. In this case the modified version does not depend on the locale. This change is a preparation for formatting sys_info which has time zone information. The function time_put<_CharT>::put() does not have proper time zone support, therefore it's a manual implementation. Fixes https://github.com/llvm/llvm-project/issues/78184 --- libcxx/include/__chrono/formatter.h | 50 ++++++++++++++++++- .../time.syn/formatter.file_time.pass.cpp | 39 ++------------- .../time/time.syn/formatter.sys_time.pass.cpp | 39 ++------------- 3 files changed, 56 insertions(+), 72 deletions(-) diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h index b64cae529a294d..d932a99f4b9983 100644 --- a/libcxx/include/__chrono/formatter.h +++ b/libcxx/include/__chrono/formatter.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP___CHRONO_FORMATTER_H #define _LIBCPP___CHRONO_FORMATTER_H +#include <__algorithm/ranges_copy.h> #include <__chrono/calendar.h> #include <__chrono/concepts.h> #include <__chrono/convert_to_tm.h> @@ -170,10 +171,45 @@ _LIBCPP_HIDE_FROM_ABI void __format_century(basic_stringstream<_CharT>& __sstr, __sstr << std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "{:02}"), __century); } +// Implements the %z format specifier according to [tab:time.format.spec], where +// '__modifier' signals %Oz or %Ez were used. (Both modifiers behave the same, +// so there is no need to distinguish between them.) 
+template +_LIBCPP_HIDE_FROM_ABI void +__format_zone_offset(basic_stringstream<_CharT>& __sstr, chrono::seconds __offset, bool __modifier) { + if (__offset < 0s) { + __sstr << _CharT('-'); + __offset = -__offset; + } else { + __sstr << _CharT('+'); + } + + chrono::hh_mm_ss __hms{__offset}; + std::ostreambuf_iterator<_CharT> __out_it{__sstr}; + if (__modifier) + std::format_to(__out_it, _LIBCPP_STATICALLY_WIDEN(_CharT, "{:%H:%M}"), __hms); + else + std::format_to(__out_it, _LIBCPP_STATICALLY_WIDEN(_CharT, "{:%H%M}"), __hms); +} + +// Helper to store the time zone information needed for formatting. +struct _LIBCPP_HIDE_FROM_ABI __time_zone { + // Typically these abbreviations are short and fit in the string's internal + // buffer. + string __abbrev; + chrono::seconds __offset; +}; + +template +_LIBCPP_HIDE_FROM_ABI __time_zone __convert_to_time_zone([[maybe_unused]] const _Tp& __value) { + return {"UTC", chrono::seconds{0}}; +} + template _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs( basic_stringstream<_CharT>& __sstr, const _Tp& __value, basic_string_view<_CharT> __chrono_specs) { tm __t = std::__convert_to_tm(__value); + __time_zone __z = __formatter::__convert_to_time_zone(__value); const auto& __facet = std::use_facet>(__sstr.getloc()); for (auto __it = __chrono_specs.begin(); __it != __chrono_specs.end(); ++__it) { if (*__it == _CharT('%')) { @@ -296,9 +332,13 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs( {__sstr}, __sstr, _CharT(' '), std::addressof(__t), std::to_address(__s), std::to_address(__it + 1)); } break; + case _CharT('z'): + __formatter::__format_zone_offset(__sstr, __z.__offset, false); + break; + case _CharT('Z'): - // TODO FMT Add proper timezone support. - __sstr << _LIBCPP_STATICALLY_WIDEN(_CharT, "UTC"); + // __abbrev is always a char so the copy may convert. + ranges::copy(__z.__abbrev, std::ostreambuf_iterator<_CharT>{__sstr}); break; case _CharT('O'): @@ -314,9 +354,15 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs( break; } } + + // Oz produces the same output as Ez below. [[fallthrough]]; case _CharT('E'): ++__it; + if (*__it == 'z') { + __formatter::__format_zone_offset(__sstr, __z.__offset, true); + break; + } [[fallthrough]]; default: __facet.put( diff --git a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp index b07282593d759c..f57841cca86293 100644 --- a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp @@ -904,12 +904,6 @@ static void test_valid_values_date_time() { template static void test_valid_values_time_zone() { -// The Apple CI gives %z='-0700' %Ez='-0700' %Oz='-0700' %Z='UTC' -// -0700 looks like the local time where the CI happens to reside, therefore -// omit this test on Apple. -// The Windows CI gives %z='-0000', but on local machines set to a different -// timezone, it gives e.g. %z='+0200'. 
-#if !defined(__APPLE__) && !defined(_WIN32) using namespace std::literals::chrono_literals; constexpr std::basic_string_view fmt = SV("{:%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}"); @@ -918,48 +912,23 @@ static void test_valid_values_time_zone() { const std::locale loc(LOCALE_ja_JP_UTF_8); std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); -# if defined(_AIX) // Non localized output using C-locale - check(SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"), + check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"), fmt, file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 // Use the global locale (fr_FR) - check(SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"), + check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"), lfmt, file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 - // Use supplied locale (ja_JP). This locale has a different alternate.a + // Use supplied locale (ja_JP). check(loc, - SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"), - lfmt, - file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 -# else // defined(_AIX) - // Non localized output using C-locale - check(SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"), - fmt, - file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 - - // Use the global locale (fr_FR) - check(SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"), + SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"), lfmt, file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 - // Use supplied locale (ja_JP). This locale has a different alternate.a -# if defined(__FreeBSD__) - check(loc, - SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"), - lfmt, - file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 -# else - check(loc, - SV("%z='+0000'\t%Ez='+0000'\t%Oz='+〇'\t%Z='UTC'\n"), - lfmt, - file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 -# endif -# endif // defined(_AIX) std::locale::global(std::locale::classic()); -#endif // !defined(__APPLE__) && !defined(_WIN32) } template diff --git a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp index 2fed270cbade72..3a7d6f9a6b01fc 100644 --- a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp @@ -900,12 +900,6 @@ static void test_valid_values_date_time() { template static void test_valid_values_time_zone() { -// The Apple CI gives %z='-0700' %Ez='-0700' %Oz='-0700' %Z='UTC' -// -0700 looks like the local time where the CI happens to reside, therefore -// omit this test on Apple. -// The Windows CI gives %z='-0000', but on local machines set to a different -// timezone, it gives e.g. %z='+0200'. 
-#if !defined(__APPLE__) && !defined(_WIN32) using namespace std::literals::chrono_literals; constexpr std::basic_string_view fmt = SV("{:%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}"); @@ -914,48 +908,23 @@ static void test_valid_values_time_zone() { const std::locale loc(LOCALE_ja_JP_UTF_8); std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); -# if defined(_AIX) // Non localized output using C-locale - check(SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"), + check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"), fmt, std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 // Use the global locale (fr_FR) - check(SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"), + check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"), lfmt, std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 - // Use supplied locale (ja_JP). This locale has a different alternate.a + // Use supplied locale (ja_JP). check(loc, - SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"), - lfmt, - std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 -# else // defined(_AIX) - // Non localized output using C-locale - check(SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"), - fmt, - std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 - - // Use the global locale (fr_FR) - check(SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"), + SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"), lfmt, std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 - // Use supplied locale (ja_JP). This locale has a different alternate.a -# if defined(__FreeBSD__) - check(loc, - SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"), - lfmt, - std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 -# else - check(loc, - SV("%z='+0000'\t%Ez='+0000'\t%Oz='+〇'\t%Z='UTC'\n"), - lfmt, - std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 -# endif -# endif // defined(_AIX) std::locale::global(std::locale::classic()); -#endif // !defined(__APPLE__) && !defined(_WIN32) } template From e096c144921daba59963f15e89d2ca6fb32d3a78 Mon Sep 17 00:00:00 2001 From: Balazs Benics Date: Wed, 17 Apr 2024 08:02:49 +0200 Subject: [PATCH 206/300] [analyzer] Fix a security.cert.env.InvalidPtr crash Fixes #88181 --- clang/docs/ReleaseNotes.rst | 2 ++ .../StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp | 6 +++++- clang/test/Analysis/invalid-ptr-checker.cpp | 10 ++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 clang/test/Analysis/invalid-ptr-checker.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index efc32212f300cf..6099f8ab02f443 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -679,6 +679,8 @@ Static Analyzer but not under any case blocks if ``unroll-loops=true`` analyzer config is set. (#GH68819) - Support C++23 static operator calls. (#GH84972) +- Fixed a crash in ``security.cert.env.InvalidPtr`` checker when accidentally + matched user-defined ``strerror`` and similar library functions. 
(GH#88181) New features ^^^^^^^^^^^^ diff --git a/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp index e5dd907c660d8e..b2947f590c4ec1 100644 --- a/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp @@ -205,8 +205,12 @@ void InvalidPtrChecker::postPreviousReturnInvalidatingCall( CE, LCtx, CE->getType(), C.blockCount()); State = State->BindExpr(CE, LCtx, RetVal); + const auto *SymRegOfRetVal = + dyn_cast_or_null(RetVal.getAsRegion()); + if (!SymRegOfRetVal) + return; + // Remember to this region. - const auto *SymRegOfRetVal = cast(RetVal.getAsRegion()); const MemRegion *MR = SymRegOfRetVal->getBaseRegion(); State = State->set(FD, MR); diff --git a/clang/test/Analysis/invalid-ptr-checker.cpp b/clang/test/Analysis/invalid-ptr-checker.cpp new file mode 100644 index 00000000000000..58bb45e0fb8421 --- /dev/null +++ b/clang/test/Analysis/invalid-ptr-checker.cpp @@ -0,0 +1,10 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,security.cert.env.InvalidPtr -verify %s + +// expected-no-diagnostics + +namespace other { +int strerror(int errnum); // custom strerror +void no_crash_on_custom_strerror() { + (void)strerror(0); // no-crash +} +} // namespace other From 024281d4d26344f9613b9115ea1fcbdbdba23235 Mon Sep 17 00:00:00 2001 From: Balazs Benics Date: Wed, 17 Apr 2024 08:02:49 +0200 Subject: [PATCH 207/300] [analyzer] Harden security.cert.env.InvalidPtr checker fn matching Relates to #88181 --- .../Checkers/cert/InvalidPtrChecker.cpp | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp index b2947f590c4ec1..fefe846b6911f7 100644 --- a/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp @@ -48,14 +48,19 @@ class InvalidPtrChecker bool InvalidatingGetEnv = false; // GetEnv can be treated invalidating and non-invalidating as well. 
- const CallDescription GetEnvCall{{"getenv"}, 1}; + const CallDescription GetEnvCall{CDM::CLibrary, {"getenv"}, 1}; const CallDescriptionMap EnvpInvalidatingFunctions = { - {{{"setenv"}, 3}, &InvalidPtrChecker::EnvpInvalidatingCall}, - {{{"unsetenv"}, 1}, &InvalidPtrChecker::EnvpInvalidatingCall}, - {{{"putenv"}, 1}, &InvalidPtrChecker::EnvpInvalidatingCall}, - {{{"_putenv_s"}, 2}, &InvalidPtrChecker::EnvpInvalidatingCall}, - {{{"_wputenv_s"}, 2}, &InvalidPtrChecker::EnvpInvalidatingCall}, + {{CDM::CLibrary, {"setenv"}, 3}, + &InvalidPtrChecker::EnvpInvalidatingCall}, + {{CDM::CLibrary, {"unsetenv"}, 1}, + &InvalidPtrChecker::EnvpInvalidatingCall}, + {{CDM::CLibrary, {"putenv"}, 1}, + &InvalidPtrChecker::EnvpInvalidatingCall}, + {{CDM::CLibrary, {"_putenv_s"}, 2}, + &InvalidPtrChecker::EnvpInvalidatingCall}, + {{CDM::CLibrary, {"_wputenv_s"}, 2}, + &InvalidPtrChecker::EnvpInvalidatingCall}, }; void postPreviousReturnInvalidatingCall(const CallEvent &Call, @@ -63,13 +68,13 @@ class InvalidPtrChecker // SEI CERT ENV34-C const CallDescriptionMap PreviousCallInvalidatingFunctions = { - {{{"setlocale"}, 2}, + {{CDM::CLibrary, {"setlocale"}, 2}, &InvalidPtrChecker::postPreviousReturnInvalidatingCall}, - {{{"strerror"}, 1}, + {{CDM::CLibrary, {"strerror"}, 1}, &InvalidPtrChecker::postPreviousReturnInvalidatingCall}, - {{{"localeconv"}, 0}, + {{CDM::CLibrary, {"localeconv"}, 0}, &InvalidPtrChecker::postPreviousReturnInvalidatingCall}, - {{{"asctime"}, 1}, + {{CDM::CLibrary, {"asctime"}, 1}, &InvalidPtrChecker::postPreviousReturnInvalidatingCall}, }; From b851c7f1fc4fd83ea84d565bbdc30fd0d356788c Mon Sep 17 00:00:00 2001 From: martinboehme Date: Wed, 17 Apr 2024 08:05:43 +0200 Subject: [PATCH 208/300] [clang][dataflow] Support `StmtExpr` in `PropagateResultObject()`. (#88872) This patch adds a test that assert-fails without the fix. --- .../FlowSensitive/DataflowEnvironment.cpp | 5 ++++ .../Analysis/FlowSensitive/TransferTest.cpp | 26 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index 3bf3807268bee9..f2b4a67e5bc97b 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -470,6 +470,11 @@ class ResultObjectVisitor : public RecursiveASTVisitor { return; } + if (auto *SE = dyn_cast(E)) { + PropagateResultObject(cast(SE->getSubStmt()->body_back()), Loc); + return; + } + // All other expression nodes that propagate a record prvalue should have // exactly one child. 
SmallVector<Stmt *, 1> Children(E->child_begin(), E->child_end());
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index d8bcc3da4b8b1c..d7a51b009712f6 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -3182,6 +3182,32 @@ TEST(TransferTest, ResultObjectLocationForStdInitializerListExpr) {
       });
 }

+TEST(TransferTest, ResultObjectLocationForStmtExpr) {
+  std::string Code = R"(
+    struct S {};
+    void target() {
+      S s = ({ S(); });
+      // [[p]]
+    }
+  )";
+  using ast_matchers::cxxConstructExpr;
+  using ast_matchers::match;
+  using ast_matchers::selectFirst;
+  using ast_matchers::traverse;
+  runDataflow(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        auto *Construct = selectFirst<CXXConstructExpr>(
+            "construct", match(cxxConstructExpr().bind("construct"), ASTCtx));
+
+        EXPECT_EQ(&Env.getResultObjectLocation(*Construct),
+                  &getLocForDecl(ASTCtx, Env, "s"));
+      });
+}
+
 TEST(TransferTest, ResultObjectLocationPropagatesThroughConditionalOperator) {
   std::string Code = R"(
     struct A {

From 47148832d4e3bf4901430732f1af6673147accb2 Mon Sep 17 00:00:00 2001
From: Hideto Ueno
Date: Wed, 17 Apr 2024 15:09:47 +0900
Subject: [PATCH 209/300] [mlir][python] Add `walk` method to PyOperationBase
 (#87962)

This commit adds a `walk` method to PyOperationBase that takes a Python object
as a callback, e.g. `op.walk(callback)`. Currently, the callback must return a
walk result explicitly.

We (SiFive) have had a Python implementation of `walk` in our internal Python
tool for a while. However, the overhead of Python is expensive, and it didn't
scale well for large MLIR files. Just replacing `walk` with this version
reduced the entire execution time of the tool by 30~40%, and there are a few
configs for which the tool takes several hours to finish, so this commit
significantly improves tool performance.

---
 mlir/include/mlir-c/IR.h                      | 10 ++-
 .../mlir/Bindings/Python/PybindAdaptors.h     |  1 +
 mlir/lib/Bindings/Python/IRCore.cpp           | 32 +++++++-
 mlir/lib/Bindings/Python/IRModule.h           |  4 +
 mlir/lib/CAPI/IR/IR.cpp                       | 21 +++++-
 mlir/test/CAPI/ir.c                           | 58 +++++++++++---
 mlir/test/python/ir/operation.py              | 75 +++++++++++++++++++
 7 files changed, 184 insertions(+), 17 deletions(-)

diff --git a/mlir/include/mlir-c/IR.h b/mlir/include/mlir-c/IR.h
index 82da511f807a34..32abacf353133e 100644
--- a/mlir/include/mlir-c/IR.h
+++ b/mlir/include/mlir-c/IR.h
@@ -705,6 +705,13 @@ MLIR_CAPI_EXPORTED void mlirOperationMoveAfter(MlirOperation op,
 MLIR_CAPI_EXPORTED void mlirOperationMoveBefore(MlirOperation op,
                                                 MlirOperation other);

+/// Operation walk result.
+typedef enum MlirWalkResult {
+  MlirWalkResultAdvance,
+  MlirWalkResultInterrupt,
+  MlirWalkResultSkip
+} MlirWalkResult;
+
 /// Traversal order for operation walk.
 typedef enum MlirWalkOrder {
   MlirWalkPreOrder,
@@ -713,7 +720,8 @@ typedef enum MlirWalkOrder {

 /// Operation walker type. The handler is passed an (opaque) reference to an
 /// operation and a pointer to a `userData`.
-typedef void (*MlirOperationWalkCallback)(MlirOperation, void *userData);
+typedef MlirWalkResult (*MlirOperationWalkCallback)(MlirOperation,
+                                                    void *userData);

 /// Walks operation `op` in `walkOrder` and calls `callback` on that operation.
/// `*userData` is passed to the callback as well and can be used to tunnel some diff --git a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h index 52f6321251919e..d8f22c7aa17096 100644 --- a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h +++ b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h @@ -18,6 +18,7 @@ #ifndef MLIR_BINDINGS_PYTHON_PYBINDADAPTORS_H #define MLIR_BINDINGS_PYTHON_PYBINDADAPTORS_H +#include #include #include #include diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 734f2f7f3f94cf..d875f4eba2b139 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -674,6 +674,7 @@ void PyMlirContext::clearOperationsInside(PyOperationBase &op) { data->rootOp.getOperation().getContext()->clearOperation(op); else data->rootSeen = true; + return MlirWalkResult::MlirWalkResultAdvance; }; mlirOperationWalk(op.getOperation(), invalidatingCallback, static_cast(&data), MlirWalkPreOrder); @@ -1249,6 +1250,21 @@ void PyOperationBase::writeBytecode(const py::object &fileObject, .str()); } +void PyOperationBase::walk( + std::function callback, + MlirWalkOrder walkOrder) { + PyOperation &operation = getOperation(); + operation.checkValid(); + MlirOperationWalkCallback walkCallback = [](MlirOperation op, + void *userData) { + auto *fn = + static_cast *>(userData); + return (*fn)(op); + }; + + mlirOperationWalk(operation, walkCallback, &callback, walkOrder); +} + py::object PyOperationBase::getAsm(bool binary, std::optional largeElementsLimit, bool enableDebugInfo, bool prettyDebugInfo, @@ -2511,6 +2527,15 @@ void mlir::python::populateIRCore(py::module &m) { .value("NOTE", MlirDiagnosticNote) .value("REMARK", MlirDiagnosticRemark); + py::enum_(m, "WalkOrder", py::module_local()) + .value("PRE_ORDER", MlirWalkPreOrder) + .value("POST_ORDER", MlirWalkPostOrder); + + py::enum_(m, "WalkResult", py::module_local()) + .value("ADVANCE", MlirWalkResultAdvance) + .value("INTERRUPT", MlirWalkResultInterrupt) + .value("SKIP", MlirWalkResultSkip); + //---------------------------------------------------------------------------- // Mapping of Diagnostics. //---------------------------------------------------------------------------- @@ -2989,8 +3014,7 @@ void mlir::python::populateIRCore(py::module &m) { py::arg("binary") = false, kOperationPrintStateDocstring) .def("print", py::overload_cast, bool, bool, bool, bool, - bool, py::object, bool>( - &PyOperationBase::print), + bool, py::object, bool>(&PyOperationBase::print), // Careful: Lots of arguments must match up with print method. 
py::arg("large_elements_limit") = py::none(), py::arg("enable_debug_info") = false, @@ -3038,7 +3062,9 @@ void mlir::python::populateIRCore(py::module &m) { return operation.createOpView(); }, "Detaches the operation from its parent block.") - .def("erase", [](PyOperationBase &self) { self.getOperation().erase(); }); + .def("erase", [](PyOperationBase &self) { self.getOperation().erase(); }) + .def("walk", &PyOperationBase::walk, py::arg("callback"), + py::arg("walk_order") = MlirWalkPostOrder); py::class_(m, "Operation", py::module_local()) .def_static("create", &PyOperation::create, py::arg("name"), diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 9acfdde25ae047..b038a0c54d29b9 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -579,6 +579,10 @@ class PyOperationBase { void writeBytecode(const pybind11::object &fileObject, std::optional bytecodeVersion); + // Implement the walk method. + void walk(std::function callback, + MlirWalkOrder walkOrder); + /// Moves the operation before or after the other operation. void moveAfter(PyOperationBase &other); void moveBefore(PyOperationBase &other); diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp index cdb64f4ec4a40f..a72cd247e73f60 100644 --- a/mlir/lib/CAPI/IR/IR.cpp +++ b/mlir/lib/CAPI/IR/IR.cpp @@ -717,17 +717,34 @@ void mlirOperationMoveBefore(MlirOperation op, MlirOperation other) { return unwrap(op)->moveBefore(unwrap(other)); } +static mlir::WalkResult unwrap(MlirWalkResult result) { + switch (result) { + case MlirWalkResultAdvance: + return mlir::WalkResult::advance(); + + case MlirWalkResultInterrupt: + return mlir::WalkResult::interrupt(); + + case MlirWalkResultSkip: + return mlir::WalkResult::skip(); + } +} + void mlirOperationWalk(MlirOperation op, MlirOperationWalkCallback callback, void *userData, MlirWalkOrder walkOrder) { switch (walkOrder) { case MlirWalkPreOrder: unwrap(op)->walk( - [callback, userData](Operation *op) { callback(wrap(op), userData); }); + [callback, userData](Operation *op) { + return unwrap(callback(wrap(op), userData)); + }); break; case MlirWalkPostOrder: unwrap(op)->walk( - [callback, userData](Operation *op) { callback(wrap(op), userData); }); + [callback, userData](Operation *op) { + return unwrap(callback(wrap(op), userData)); + }); } } diff --git a/mlir/test/CAPI/ir.c b/mlir/test/CAPI/ir.c index 8e79338c57a22a..3d05b2a12dd8ef 100644 --- a/mlir/test/CAPI/ir.c +++ b/mlir/test/CAPI/ir.c @@ -2244,9 +2244,22 @@ typedef struct { const char *x; } callBackData; -void walkCallBack(MlirOperation op, void *rootOpVoid) { +MlirWalkResult walkCallBack(MlirOperation op, void *rootOpVoid) { fprintf(stderr, "%s: %s\n", ((callBackData *)(rootOpVoid))->x, mlirIdentifierStr(mlirOperationGetName(op)).data); + return MlirWalkResultAdvance; +} + +MlirWalkResult walkCallBackTestWalkResult(MlirOperation op, void *rootOpVoid) { + fprintf(stderr, "%s: %s\n", ((callBackData *)(rootOpVoid))->x, + mlirIdentifierStr(mlirOperationGetName(op)).data); + if (strcmp(mlirIdentifierStr(mlirOperationGetName(op)).data, "func.func") == + 0) + return MlirWalkResultSkip; + if (strcmp(mlirIdentifierStr(mlirOperationGetName(op)).data, "arith.addi") == + 0) + return MlirWalkResultInterrupt; + return MlirWalkResultAdvance; } int testOperationWalk(MlirContext ctx) { @@ -2259,6 +2272,9 @@ int testOperationWalk(MlirContext ctx) { " arith.addi %1, %1: i32\n" " return\n" " }\n" + " func.func @bar() {\n" + " return\n" + " }\n" "}"; MlirModule module = 
mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(moduleString));

   callBackData data;
   data.x = "i love you";

-  // CHECK: i love you: arith.constant
-  // CHECK: i love you: arith.addi
-  // CHECK: i love you: func.return
-  // CHECK: i love you: func.func
-  // CHECK: i love you: builtin.module
+  // CHECK-NEXT: i love you: arith.constant
+  // CHECK-NEXT: i love you: arith.addi
+  // CHECK-NEXT: i love you: func.return
+  // CHECK-NEXT: i love you: func.func
+  // CHECK-NEXT: i love you: func.return
+  // CHECK-NEXT: i love you: func.func
+  // CHECK-NEXT: i love you: builtin.module
   mlirOperationWalk(mlirModuleGetOperation(module), walkCallBack,
                     (void *)(&data), MlirWalkPostOrder);

   data.x = "i don't love you";
-  // CHECK: i don't love you: builtin.module
-  // CHECK: i don't love you: func.func
-  // CHECK: i don't love you: arith.constant
-  // CHECK: i don't love you: arith.addi
-  // CHECK: i don't love you: func.return
+  // CHECK-NEXT: i don't love you: builtin.module
+  // CHECK-NEXT: i don't love you: func.func
+  // CHECK-NEXT: i don't love you: arith.constant
+  // CHECK-NEXT: i don't love you: arith.addi
+  // CHECK-NEXT: i don't love you: func.return
+  // CHECK-NEXT: i don't love you: func.func
+  // CHECK-NEXT: i don't love you: func.return
   mlirOperationWalk(mlirModuleGetOperation(module), walkCallBack,
                     (void *)(&data), MlirWalkPreOrder);
+
+  data.x = "interrupt";
+  // Interrupted at `arith.addi`
+  // CHECK-NEXT: interrupt: arith.constant
+  // CHECK-NEXT: interrupt: arith.addi
+  mlirOperationWalk(mlirModuleGetOperation(module), walkCallBackTestWalkResult,
+                    (void *)(&data), MlirWalkPostOrder);
+
+  data.x = "skip";
+  // Skip at `func.func`
+  // CHECK-NEXT: skip: builtin.module
+  // CHECK-NEXT: skip: func.func
+  // CHECK-NEXT: skip: func.func
+  mlirOperationWalk(mlirModuleGetOperation(module), walkCallBackTestWalkResult,
+                    (void *)(&data), MlirWalkPreOrder);
+
   mlirModuleDestroy(module);
   return 0;
 }
diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py
index 04f8a9936e31f7..9666e63bda1e0e 100644
--- a/mlir/test/python/ir/operation.py
+++ b/mlir/test/python/ir/operation.py
@@ -1015,3 +1015,78 @@ def testOperationParse():
     print(
         f"op_with_source_name: {o.get_asm(enable_debug_info=True, use_local_scope=True)}"
     )
+
+
+# CHECK-LABEL: TEST: testOpWalk
+@run
+def testOpWalk():
+    ctx = Context()
+    ctx.allow_unregistered_dialects = True
+    module = Module.parse(
+        r"""
+    builtin.module {
+      func.func @f() {
+        func.return
+      }
+    }
+  """,
+        ctx,
+    )
+
+    def callback(op):
+        print(op.name)
+        return WalkResult.ADVANCE
+
+    # Test post-order walk (default).
+    # CHECK-NEXT: Post-order
+    # CHECK-NEXT: func.return
+    # CHECK-NEXT: func.func
+    # CHECK-NEXT: builtin.module
+    print("Post-order")
+    module.operation.walk(callback)
+
+    # Test pre-order walk.
+    # CHECK-NEXT: Pre-order
+    # CHECK-NEXT: builtin.module
+    # CHECK-NEXT: func.func
+    # CHECK-NEXT: func.return
+    print("Pre-order")
+    module.operation.walk(callback, WalkOrder.PRE_ORDER)
+
+    # Test interrupt.
+    # CHECK-NEXT: Interrupt post-order
+    # CHECK-NEXT: func.return
+    print("Interrupt post-order")
+
+    def callback(op):
+        print(op.name)
+        return WalkResult.INTERRUPT
+
+    module.operation.walk(callback)
+
+    # Test skip.
+    # CHECK-NEXT: Skip pre-order
+    # CHECK-NEXT: builtin.module
+    print("Skip pre-order")
+
+    def callback(op):
+        print(op.name)
+        return WalkResult.SKIP
+
+    module.operation.walk(callback, WalkOrder.PRE_ORDER)
+
+    # Test exception.
+ # CHECK: Exception + # CHECK-NEXT: func.return + # CHECK-NEXT: Exception raised + print("Exception") + + def callback(op): + print(op.name) + raise ValueError + return WalkResult.ADVANCE + + try: + module.operation.walk(callback) + except ValueError: + print("Exception raised") From 1bccbe1f49abc39b9f980cf3f1b171da5541d1a4 Mon Sep 17 00:00:00 2001 From: martinboehme Date: Wed, 17 Apr 2024 08:17:56 +0200 Subject: [PATCH 210/300] [clang][dataflow] Treat `BuiltinBitCastExpr` correctly in `PropagateResultObject()`. (#88875) This patch includes a test that assert-fails without the fix. --- .../FlowSensitive/DataflowEnvironment.cpp | 6 ++++- .../Analysis/FlowSensitive/TransferTest.cpp | 26 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index f2b4a67e5bc97b..3f1600d9ac5d87 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -419,7 +419,11 @@ class ResultObjectVisitor : public RecursiveASTVisitor { // below them can initialize the same object (or part of it). if (isa(E) || isa(E) || isa(E) || isa(E) || isa(E) || - isa(E)) { + isa(E) || + // We treat `BuiltinBitCastExpr` as an "original initializer" too as + // it may not even be casting from a record type -- and even if it is, + // the two objects are in general of unrelated type. + isa(E)) { return; } if (auto *Op = dyn_cast(E); diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index d7a51b009712f6..97ec32126c1dc4 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -3208,6 +3208,32 @@ TEST(TransferTest, ResultObjectLocationForStmtExpr) { }); } +TEST(TransferTest, ResultObjectLocationForBuiltinBitCastExpr) { + std::string Code = R"( + struct S { int i; }; + void target(int i) { + S s = __builtin_bit_cast(S, i); + // [[p]] + } + )"; + using ast_matchers::explicitCastExpr; + using ast_matchers::match; + using ast_matchers::selectFirst; + using ast_matchers::traverse; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + + auto *BuiltinBitCast = selectFirst( + "cast", match(explicitCastExpr().bind("cast"), ASTCtx)); + + EXPECT_EQ(&Env.getResultObjectLocation(*BuiltinBitCast), + &getLocForDecl(ASTCtx, Env, "s")); + }); +} + TEST(TransferTest, ResultObjectLocationPropagatesThroughConditionalOperator) { std::string Code = R"( struct A { From 64c649585ca23a0c996d8814d2796cd348441d69 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Wed, 17 Apr 2024 09:20:55 +0300 Subject: [PATCH 211/300] [clang][NFC] Move `Sema::SkipBodyInfo` into namespace scope This makes it forward-declarable, and needed from splitting `Sema` up. 
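For example, other headers can now refer to the type without pulling in all of
Sema.h. The sketch below is illustrative only; `notePlannedSkip` is a
hypothetical function, not part of this patch:

  namespace clang {
  struct SkipBodyInfo; // forward declaration, possible now that the type
                       // lives at namespace scope rather than inside Sema

  // Hypothetical interface that only needs the type by reference.
  void notePlannedSkip(SkipBodyInfo &Info);
  } // namespace clang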
--- clang/include/clang/Sema/Sema.h | 16 ++++++++-------- clang/lib/Parse/ParseDecl.cpp | 2 +- clang/lib/Parse/ParseDeclCXX.cpp | 2 +- clang/lib/Parse/ParseObjc.cpp | 4 ++-- clang/lib/Parse/Parser.cpp | 2 +- clang/lib/Sema/SemaDecl.cpp | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 77150a318ee47d..091c5c02f75df9 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -352,6 +352,14 @@ class PreferredTypeBuilder { llvm::function_ref ComputeType; }; +struct SkipBodyInfo { + SkipBodyInfo() = default; + bool ShouldSkip = false; + bool CheckSameAsPrevious = false; + NamedDecl *Previous = nullptr; + NamedDecl *New = nullptr; +}; + /// Describes the result of template argument deduction. /// /// The TemplateDeductionResult enumeration describes the result of @@ -2627,14 +2635,6 @@ class Sema final : public SemaBase { return Entity->getOwningModule(); } - struct SkipBodyInfo { - SkipBodyInfo() = default; - bool ShouldSkip = false; - bool CheckSameAsPrevious = false; - NamedDecl *Previous = nullptr; - NamedDecl *New = nullptr; - }; - DeclGroupPtrTy ConvertDeclToDeclGroup(Decl *Ptr, Decl *OwnedType = nullptr); ParsedType getTypeName(const IdentifierInfo &II, SourceLocation NameLoc, diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index c881b37507771a..274ee7b10c1787 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -5331,7 +5331,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS, stripTypeAttributesOffDeclSpec(attrs, DS, TUK); - Sema::SkipBodyInfo SkipBody; + SkipBodyInfo SkipBody; if (!Name && TUK == Sema::TUK_Definition && Tok.is(tok::l_brace) && NextToken().is(tok::identifier)) SkipBody = Actions.shouldSkipAnonEnumBody(getCurScope(), diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index cd4803d51bc1de..51fd64b2d01aa7 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -2092,7 +2092,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind, TypeResult TypeResult = true; // invalid bool Owned = false; - Sema::SkipBodyInfo SkipBody; + SkipBodyInfo SkipBody; if (TemplateId) { // Explicit specialization, class template partial specialization, // or explicit instantiation. 
diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp index 887d7a36cee7e9..671dcb71e51a37 100644 --- a/clang/lib/Parse/ParseObjc.cpp +++ b/clang/lib/Parse/ParseObjc.cpp @@ -375,7 +375,7 @@ Decl *Parser::ParseObjCAtInterfaceDeclaration(SourceLocation AtLoc, Actions.ActOnTypedefedProtocols(protocols, protocolLocs, superClassId, superClassLoc); - Sema::SkipBodyInfo SkipBody; + SkipBodyInfo SkipBody; ObjCInterfaceDecl *ClsType = Actions.ActOnStartClassInterface( getCurScope(), AtLoc, nameId, nameLoc, typeParameterList, superClassId, superClassLoc, typeArgs, @@ -2133,7 +2133,7 @@ Parser::ParseObjCAtProtocolDeclaration(SourceLocation AtLoc, /*consumeLastToken=*/true)) return nullptr; - Sema::SkipBodyInfo SkipBody; + SkipBodyInfo SkipBody; ObjCProtocolDecl *ProtoType = Actions.ActOnStartProtocolInterface( AtLoc, protocolName, nameLoc, ProtocolRefs.data(), ProtocolRefs.size(), ProtocolLocs.data(), EndProtoLoc, attrs, &SkipBody); diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index d6f2b9f448cd52..ef46fc74cedc14 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -1441,7 +1441,7 @@ Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D, // Tell the actions module that we have entered a function definition with the // specified Declarator for the function. - Sema::SkipBodyInfo SkipBody; + SkipBodyInfo SkipBody; Decl *Res = Actions.ActOnStartOfFunctionDef(getCurScope(), D, TemplateInfo.TemplateParams ? *TemplateInfo.TemplateParams diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 745cf41e204e7a..19abd5327b73aa 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -3037,7 +3037,7 @@ static void checkNewAttributesAfterDef(Sema &S, Decl *New, const Decl *Old) { if (isa(NewAttribute) || isa(NewAttribute)) { if (FunctionDecl *FD = dyn_cast(New)) { - Sema::SkipBodyInfo SkipBody; + SkipBodyInfo SkipBody; S.CheckForFunctionRedefinition(FD, cast(Def), &SkipBody); // If we're skipping this definition, drop the "alias" attribute. @@ -19999,7 +19999,7 @@ EnumConstantDecl *Sema::CheckEnumConstant(EnumDecl *Enum, Val, EnumVal); } -Sema::SkipBodyInfo Sema::shouldSkipAnonEnumBody(Scope *S, IdentifierInfo *II, +SkipBodyInfo Sema::shouldSkipAnonEnumBody(Scope *S, IdentifierInfo *II, SourceLocation IILoc) { if (!(getLangOpts().Modules || getLangOpts().ModulesLocalVisibility) || !getLangOpts().CPlusPlus) From 16f188761da1df6ba5e6627b8742aacfec8e9ec5 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Wed, 17 Apr 2024 14:23:37 +0800 Subject: [PATCH 212/300] CompilerRT: Normalize COMPILER_RT_DEFAULT_TARGET_TRIPLE (#88835) If LLVM is configured with -DLLVM_DEFAULT_TARGET_TRIPLE, or compiler_rt is configured with -DCOMPILER_RT_DEFAULT_TARGET_TRIPLE, while the argument is not normalized, such as Debian-style vendor-less triple, clang will try to find libclang_rt in lib/, while libclang_rt is placed into lib/. Let's also place libclang_rt into lib/. 
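In other words, clang searches for the runtimes in a lib/ subdirectory named
after the normalized triple, so the install directory must use the same
spelling. An illustrative check, using the same -print-effective-triple query
the patch adds (exact triples depend on the host and configuration):

  $ clang --target=x86_64-linux-gnu -print-effective-triple
  x86_64-unknown-linux-gnu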
--- compiler-rt/cmake/Modules/CompilerRTUtils.cmake | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake index e8e5f612d5b03c..6d413f6753bc0c 100644 --- a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake @@ -368,6 +368,12 @@ macro(construct_compiler_rt_default_triple) "Default triple for which compiler-rt runtimes will be built.") endif() + if ("${CMAKE_C_COMPILER_ID}" MATCHES "Clang") + execute_process(COMMAND ${CMAKE_C_COMPILER} --target=${COMPILER_RT_DEFAULT_TARGET_TRIPLE} -print-effective-triple + OUTPUT_VARIABLE COMPILER_RT_DEFAULT_TARGET_TRIPLE + OUTPUT_STRIP_TRAILING_WHITESPACE) + endif() + string(REPLACE "-" ";" LLVM_TARGET_TRIPLE_LIST ${COMPILER_RT_DEFAULT_TARGET_TRIPLE}) list(GET LLVM_TARGET_TRIPLE_LIST 0 COMPILER_RT_DEFAULT_TARGET_ARCH) From b090569685699abe4a8031ad442a0f81e373146b Mon Sep 17 00:00:00 2001 From: Jesse Huang Date: Tue, 16 Apr 2024 23:36:27 -0700 Subject: [PATCH 213/300] [RISCV] Support Zama16b1p0 (#88474) This patch adds the support for Zama16b version 1.0, which has been added to RVA23U64 optional extensions recently --- clang/test/Preprocessor/riscv-target-features.c | 7 +++++++ llvm/docs/RISCVUsage.rst | 3 ++- llvm/lib/Support/RISCVISAInfo.cpp | 1 + llvm/lib/Target/RISCV/RISCVFeatures.td | 7 +++++++ llvm/test/CodeGen/RISCV/attributes.ll | 4 ++++ llvm/test/MC/RISCV/attribute-arch.s | 3 +++ llvm/unittests/Support/RISCVISAInfoTest.cpp | 1 + 7 files changed, 25 insertions(+), 1 deletion(-) diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index ec7764bb538189..646043681fe330 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -79,6 +79,7 @@ // CHECK-NOT: __riscv_za128rs {{.*$}} // CHECK-NOT: __riscv_za64rs {{.*$}} // CHECK-NOT: __riscv_zacas {{.*$}} +// CHECK-NOT: __riscv_zama16b {{.*$}} // CHECK-NOT: __riscv_zawrs {{.*$}} // CHECK-NOT: __riscv_zba {{.*$}} // CHECK-NOT: __riscv_zbb {{.*$}} @@ -704,6 +705,12 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s // CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}} +// RUN: %clang --target=riscv32 -march=rv32izama16b -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZAMA16B-EXT %s +// RUN: %clang --target=riscv64 -march=rv64izama16b -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZAMA16B-EXT %s +// CHECK-ZAMA16B-EXT: __riscv_zama16b 1000000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32izawrs -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZAWRS-EXT %s diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 6f5eba263def43..a4cf17a8398a82 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -119,6 +119,7 @@ on support follow. ``Za128rs`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Za64rs`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Zacas`` Supported (`See note <#riscv-zacas-note>`__) + ``Zama16b`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Zawrs`` Assembly Support ``Zba`` Supported ``Zbb`` Supported @@ -237,7 +238,7 @@ Supported .. 
_riscv-profiles-extensions-note: -``Za128rs``, ``Za64rs``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare`` +``Za128rs``, ``Za64rs``, ``Zama16b``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare`` These extensions are defined as part of the `RISC-V Profiles specification `__. They do not introduce any new features themselves, but instead describe existing hardware features. .. _riscv-zacas-note: diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index cbdc64bc7a97be..fa967403ea449c 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -119,6 +119,7 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"za128rs", {1, 0}}, {"za64rs", {1, 0}}, {"zacas", {1, 0}}, + {"zama16b", {1, 0}}, {"zawrs", {1, 0}}, {"zba", {1, 0}}, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 561187c39a4a04..f830ead5dd692a 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -208,6 +208,13 @@ def HasStdExtAOrZalrsc "'A' (Atomic Instructions) or " "'Zalrsc' (Load-Reserved/Store-Conditional)">; +def FeatureStdExtZama16b + : SubtargetFeature<"zama16b", "HasStdExtZama16b", "true", + "'Zama16b' (Atomic 16-byte misaligned loads, stores and AMOs)">; +def HasStdExtZama16b : Predicate<"Subtarget->hasStdExtZama16b()">, + AssemblerPredicate<(all_of FeatureStdExtZama16b), + "'Zama16b' (Atomic 16-byte misaligned loads, stores and AMOs)">; + def FeatureStdExtZawrs : SubtargetFeature<"zawrs", "HasStdExtZawrs", "true", "'Zawrs' (Wait on Reservation Set)">; def HasStdExtZawrs : Predicate<"Subtarget->hasStdExtZawrs()">, diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 2326599bf35136..080783fdeec024 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -115,6 +115,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+zacas %s -o - | FileCheck --check-prefix=RV32ZACAS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zalasr %s -o - | FileCheck --check-prefix=RV32ZALASR %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zalrsc %s -o - | FileCheck --check-prefix=RV32ZALRSC %s +; RUN: llc -mtriple=riscv32 -mattr=+zama16b %s -o - | FileCheck --check-prefixes=CHECK,RV32ZAMA16B %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfilp %s -o - | FileCheck --check-prefix=RV32ZICFILP %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zabha %s -o - | FileCheck --check-prefix=RV32ZABHA %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-ssnpm %s -o - | FileCheck --check-prefix=RV32SSNPM %s @@ -199,6 +200,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+xtheadvdot %s -o - | FileCheck --check-prefixes=CHECK,RV64XTHEADVDOT %s ; RUN: llc -mtriple=riscv64 -mattr=+za64rs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZA64RS %s ; RUN: llc -mtriple=riscv64 -mattr=+za128rs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZA128RS %s +; RUN: llc -mtriple=riscv64 -mattr=+zama16b %s -o - | FileCheck --check-prefixes=CHECK,RV64ZAMA16B %s ; RUN: llc -mtriple=riscv64 
-mattr=+zawrs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZAWRS %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-ztso %s -o - | FileCheck --check-prefixes=CHECK,RV64ZTSO %s ; RUN: llc -mtriple=riscv64 -mattr=+zca %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCA %s @@ -370,6 +372,7 @@ ; RV32ZACAS: .attribute 5, "rv32i2p1_a2p1_zacas1p0" ; RV32ZALASR: .attribute 5, "rv32i2p1_zalasr0p1" ; RV32ZALRSC: .attribute 5, "rv32i2p1_zalrsc0p2" +; RV32ZAMA16B: .attribute 5, "rv32i2p1_zama16b1p0" ; RV32ZICFILP: .attribute 5, "rv32i2p1_zicfilp0p4" ; RV32ZABHA: .attribute 5, "rv32i2p1_a2p1_zabha1p0" ; RV32SSNPM: .attribute 5, "rv32i2p1_ssnpm0p8" @@ -418,6 +421,7 @@ ; RV64ZICBOZ: .attribute 5, "rv64i2p1_zicboz1p0" ; RV64ZA64RS: .attribute 5, "rv64i2p1_za64rs1p0" ; RV64ZA128RS: .attribute 5, "rv64i2p1_za128rs1p0" +; RV64ZAMA16B: .attribute 5, "rv64i2p1_zama16b1p0" ; RV64ZAWRS: .attribute 5, "rv64i2p1_zawrs1p0" ; RV64ZICBOP: .attribute 5, "rv64i2p1_zicbop1p0" ; RV64SHCOUNTERENW: .attribute 5, "rv64i2p1_shcounterenw1p0" diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index a8f493f781ec3d..8835ff22446c8d 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -270,6 +270,9 @@ .attribute arch, "rv32iza64rs1p0" # CHECK: attribute 5, "rv32i2p1_za64rs1p0" +.attribute arch, "rv32izama16b" +# CHECK: attribute 5, "rv32i2p1_zama16b1p0" + .attribute arch, "rv32izawrs1p0" # CHECK: attribute 5, "rv32i2p1_zawrs1p0" diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp index 67012d2e6dc720..caf7bf0a317174 100644 --- a/llvm/unittests/Support/RISCVISAInfoTest.cpp +++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp @@ -769,6 +769,7 @@ R"(All available -march extensions for RISC-V za128rs 1.0 za64rs 1.0 zacas 1.0 + zama16b 1.0 zawrs 1.0 zfa 1.0 zfh 1.0 From d35a64363bb851045387717d2ef7d6449b7b547f Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Wed, 17 Apr 2024 08:33:07 +0200 Subject: [PATCH 214/300] Revert "Fix test from #83124 and #88902" This reverts commit 0a789ea8a829da345e46d8224d73b2ddaba6969f. Breaks builds, see discussion in https://github.com/llvm/llvm-project/pull/83124 --- clang/test/SemaCXX/PR41441.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/clang/test/SemaCXX/PR41441.cpp b/clang/test/SemaCXX/PR41441.cpp index d0f2917e52f211..0b012b33fce343 100644 --- a/clang/test/SemaCXX/PR41441.cpp +++ b/clang/test/SemaCXX/PR41441.cpp @@ -1,9 +1,6 @@ // RUN: %clang --target=x86_64-pc-linux -S -fno-discard-value-names -emit-llvm -o - %s | FileCheck %s -namespace std { - using size_t = decltype(sizeof(int)); -}; -void* operator new[](std::size_t, void*) noexcept; +#include <new> // CHECK: call void @llvm.memset.p0.i64(ptr align 1 %x, i8 0, i64 8, i1 false) // CHECK: call void @llvm.memset.p0.i64(ptr align 16 %x, i8 0, i64 32, i1 false) From dbda478693104f78b142375862d66f3369ad8c78 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Wed, 17 Apr 2024 08:33:44 +0200 Subject: [PATCH 215/300] Revert "[Clang][Sema] placement new initializes typedef array with correct size (#88902)" This reverts commit 5c6af605b307213453a9a043532b9293db21b5c6.
Breaks builds, see discussion in https://github.com/llvm/llvm-project/pull/83124 --- .../{PR41441.cpp => instantiate-new-placement-size.cpp} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename clang/test/SemaCXX/{PR41441.cpp => instantiate-new-placement-size.cpp} (75%) diff --git a/clang/test/SemaCXX/PR41441.cpp b/clang/test/SemaCXX/instantiate-new-placement-size.cpp similarity index 75% rename from clang/test/SemaCXX/PR41441.cpp rename to clang/test/SemaCXX/instantiate-new-placement-size.cpp index 0b012b33fce343..7a29d3dee8491e 100644 --- a/clang/test/SemaCXX/PR41441.cpp +++ b/clang/test/SemaCXX/instantiate-new-placement-size.cpp @@ -1,5 +1,5 @@ -// RUN: %clang --target=x86_64-pc-linux -S -fno-discard-value-names -emit-llvm -o - %s | FileCheck %s - +// RUN: %clang -S -fno-discard-value-names -emit-llvm -o - %s | FileCheck %s +// Issue no: 41441 #include <new> // CHECK: call void @llvm.memset.p0.i64(ptr align 1 %x, i8 0, i64 8, i1 false) From dd84d23adc84cc0c3d2b8fb8f0c353279d99d27a Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Wed, 17 Apr 2024 08:36:29 +0200 Subject: [PATCH 216/300] Revert "[Clang][Sema] placement new initializes typedef array with correct size (#83124)" This reverts commit c309dc6d0759b23b570c563f611530ff1a49e1bd. Breaks builds, see discussion in https://github.com/llvm/llvm-project/pull/83124 --- clang/docs/ReleaseNotes.rst | 1 - clang/lib/Sema/TreeTransform.h | 14 +------------ .../instantiate-new-placement-size.cpp | 20 ------------------- 3 files changed, 1 insertion(+), 34 deletions(-) delete mode 100644 clang/test/SemaCXX/instantiate-new-placement-size.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6099f8ab02f443..96ad92b540b47f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -534,7 +534,6 @@ Bug Fixes to C++ Support Fixes (#GH70604), (#GH79754), (#GH84163), (#GH84425), (#GH86054), (#GH86398), and (#GH86399). - Fix a crash when deducing ``auto`` from an invalid dereference (#GH88329). - Fix a crash in requires expression with templated base class member function. Fixes (#GH84020). -- Placement new initializes typedef array with correct size (#GH41441) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 0c7fdb357235e1..eb05783a6219dc 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -12864,19 +12864,6 @@ TreeTransform<Derived>::TransformCXXNewExpr(CXXNewExpr *E) { ArraySize = NewArraySize.get(); } - // Per C++0x [expr.new]p5, the type being constructed may be a - // typedef of an array type. - QualType AllocType = AllocTypeInfo->getType(); - if (ArraySize) { - if (const ConstantArrayType *Array = - SemaRef.Context.getAsConstantArrayType(AllocType)) { - ArraySize = IntegerLiteral::Create(SemaRef.Context, Array->getSize(), - SemaRef.Context.getSizeType(), - E->getBeginLoc()); - AllocType = Array->getElementType(); - } - } - // Transform the placement arguments (if any).
bool ArgumentChanged = false; SmallVector<Expr*, 8> PlacementArgs; @@ -12938,6 +12925,7 @@ TreeTransform<Derived>::TransformCXXNewExpr(CXXNewExpr *E) { return E; } + QualType AllocType = AllocTypeInfo->getType(); if (!ArraySize) { // If no array size was specified, but the new expression was // instantiated with an array type (e.g., "new T" where T is diff --git a/clang/test/SemaCXX/instantiate-new-placement-size.cpp b/clang/test/SemaCXX/instantiate-new-placement-size.cpp deleted file mode 100644 index 7a29d3dee8491e..00000000000000 --- a/clang/test/SemaCXX/instantiate-new-placement-size.cpp +++ /dev/null @@ -1,20 +0,0 @@ -// RUN: %clang -S -fno-discard-value-names -emit-llvm -o - %s | FileCheck %s -// Issue no: 41441 -#include <new> - -// CHECK: call void @llvm.memset.p0.i64(ptr align 1 %x, i8 0, i64 8, i1 false) -// CHECK: call void @llvm.memset.p0.i64(ptr align 16 %x, i8 0, i64 32, i1 false) -template <typename TYPE> -void f() -{ - typedef TYPE TArray[8]; - - TArray x; - new(&x) TArray(); -} - -int main() -{ - f<char>(); - f<float>(); -} From bc3620d3a8b1be9534a5635431b0aa09cc50ff3c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 17 Apr 2024 08:50:14 +0200 Subject: [PATCH 217/300] AMDGPU: Move libcall simplify into PeepholeEP (#88853) We were running this immediately on the incoming IR, which is still littered with temporary allocas obscuring trivial values. This needs to run after initial SROA to handle sincos insertion. --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 13 +++- .../amdgpu-libcall-sincos-pass-ordering.ll | 77 +++++++++++++++++++ .../AMDGPU/amdgpu-simplify-libcall-sincos.ll | 17 ++-- llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll | 6 +- 4 files changed, 96 insertions(+), 17 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f7e552177d6f50..305a6c8c3b9262 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -655,9 +655,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks( PB.registerPipelineStartEPCallback( [](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; - FPM.addPass(AMDGPUUseNativeCallsPass()); - if (EnableLibCallSimplify && Level != OptimizationLevel::O0) - FPM.addPass(AMDGPUSimplifyLibCallsPass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); if (EnableHipStdPar) PM.addPass(HipStdParAcceleratorCodeSelectionPass()); @@ -681,6 +678,16 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks( PM.addPass(AMDGPUAlwaysInlinePass()); }); + PB.registerPeepholeEPCallback( + [](FunctionPassManager &FPM, OptimizationLevel Level) { + if (Level == OptimizationLevel::O0) + return; + + FPM.addPass(AMDGPUUseNativeCallsPass()); + if (EnableLibCallSimplify) + FPM.addPass(AMDGPUSimplifyLibCallsPass()); + }); + PB.registerCGSCCOptimizerLateEPCallback( [this](CGSCCPassManager &PM, OptimizationLevel Level) { if (Level == OptimizationLevel::O0) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll new file mode 100644 index 00000000000000..6b835bb4eef662 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 -amdgpu-prelink %s | FileCheck %s + +; Make sure that sin+cos -> sincos
simplification happens after +; initial IR simplifications, otherwise we can't identify the common +; argument value. + +@.str = private unnamed_addr addrspace(4) constant [21 x i8] c"x: %f, y: %f, z: %f\0A\00", align 1 + +; Should have call to sincos declarations, not calls to the asm pseudo-libcalls +define protected amdgpu_kernel void @swdev456865(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, float noundef %x) #0 { +; CHECK-LABEL: define protected amdgpu_kernel void @swdev456865( +; CHECK-SAME: ptr addrspace(1) nocapture writeonly [[OUT0:%.*]], ptr addrspace(1) nocapture writeonly [[OUT1:%.*]], ptr addrspace(1) nocapture writeonly [[OUT2:%.*]], float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) +; CHECK-NEXT: [[I_I:%.*]] = call float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[I_I2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[I_I]], [[I_I2]] +; CHECK-NEXT: [[CONV:%.*]] = fpext float [[X]] to double +; CHECK-NEXT: [[CONV5:%.*]] = fpext float [[ADD]] to double +; CHECK-NEXT: store double [[CONV]], ptr addrspace(1) [[OUT0]], align 8 +; CHECK-NEXT: store double [[CONV5]], ptr addrspace(1) [[OUT1]], align 8 +; CHECK-NEXT: store double [[CONV5]], ptr addrspace(1) [[OUT2]], align 8 +; CHECK-NEXT: ret void +; +entry: + %x.addr = alloca float, align 4, addrspace(5) + %y = alloca float, align 4, addrspace(5) + %z = alloca float, align 4, addrspace(5) + store float %x, ptr addrspace(5) %x.addr, align 4 + call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %y) + %i = load float, ptr addrspace(5) %x.addr, align 4 + %call = call float @_Z3sinf(float noundef %i) #3 + %i1 = load float, ptr addrspace(5) %x.addr, align 4 + %call1 = call float @_Z3cosf(float noundef %i1) #3 + %add = fadd float %call, %call1 + store float %add, ptr addrspace(5) %y, align 4 + call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %z) + %i2 = load float, ptr addrspace(5) %x.addr, align 4 + %call2 = call float @_Z3cosf(float noundef %i2) #3 + %i3 = load float, ptr addrspace(5) %x.addr, align 4 + %call3 = call float @_Z3sinf(float noundef %i3) #3 + %add4 = fadd float %call2, %call3 + store float %add4, ptr addrspace(5) %z, align 4 + %i4 = load float, ptr addrspace(5) %x.addr, align 4 + %conv = fpext float %i4 to double + %i5 = load float, ptr addrspace(5) %y, align 4 + %conv5 = fpext float %i5 to double + %i6 = load float, ptr addrspace(5) %z, align 4 + %conv6 = fpext float %i6 to double + store double %conv, ptr addrspace(1) %out0, align 8 + store double %conv5, ptr addrspace(1) %out1, align 8 + store double %conv6, ptr addrspace(1) %out2, align 8 + call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %z) + call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %y) + ret void +} + +declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture) #1 +declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #1 + +define internal float @_Z3cosf(float noundef %arg) #2 { +bb: + %i = tail call float asm "pseudo-libcall-cos %0, %1", "=v,v"(float noundef %arg) #2 + ret float %i +} + +define internal float @_Z3sinf(float noundef %arg) #2 { +bb: + %i = tail call float asm "pseudo-libcall-sin %0, %1", "=v,v"(float noundef %arg) #2 + ret float %i +} + +attributes #0 = { norecurse nounwind } +attributes #1 = { nocallback nofree nosync nounwind willreturn 
memory(argmem: readwrite) } +attributes #2 = { mustprogress nofree norecurse nounwind willreturn memory(none) } +attributes #3 = { nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll index 5c56276eeb0f1c..9646d196da42f6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll @@ -884,10 +884,9 @@ entry: define float @sincos_f32_unused_result_cos(float %x) { ; CHECK-LABEL: define float @sincos_f32_unused_result_cos -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) +; CHECK-NEXT: [[TMP0:%.*]] = tail call contract float @_Z3sinf(float [[X]]) ; CHECK-NEXT: ret float [[TMP0]] ; entry: @@ -900,11 +899,9 @@ entry: define float @sincos_f32_unused_result_sin(float %x) { ; CHECK-LABEL: define float @sincos_f32_unused_result_sin -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = tail call contract float @_Z3cosf(float [[X]]) ; CHECK-NEXT: ret float [[TMP1]] ; entry: @@ -917,13 +914,11 @@ entry: define void @sincos_f32_repeated_uses(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) { ; CHECK-LABEL: define void @sincos_f32_repeated_uses -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: [[__SINCOS_3:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_3]]) -; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_3]], align 4 ; CHECK-NEXT: store volatile float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: store volatile float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: store volatile float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll index 731a88278e512c..204c8140d3f17d 100644 --- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -278,7 +278,7 @@ entry: ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half ; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 5.000000e-01) -; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp) +; GCN-PRELINK: %__pow2sqrt = tail call 
fast float @llvm.sqrt.f32(float %tmp) define amdgpu_kernel void @test_pow_half(ptr addrspace(1) nocapture %a) { entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 @@ -476,7 +476,7 @@ declare float @_Z5rootnfi(float, i32) ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2 ; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 2) -; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp) +; GCN-PRELINK: %__rootn2sqrt = tail call fast float @llvm.sqrt.f32(float %tmp) define amdgpu_kernel void @test_rootn_2(ptr addrspace(1) nocapture %a) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -838,5 +838,5 @@ entry: ; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]] ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind } -; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nofree nounwind memory(read) } +; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) } attributes #0 = { nounwind } From e11b17a4ed90e74147594012207fc35a60515944 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Wed, 17 Apr 2024 09:51:24 +0300 Subject: [PATCH 218/300] [clang][NFC] Refactor `Sema::CheckedConversionKind` Convert it to scoped enum, and move it to namespace scope to enable forward declarations. --- clang/include/clang/Sema/Sema.h | 65 +++++++++++++++---------- clang/lib/Sema/SemaCast.cpp | 73 ++++++++++++++--------------- clang/lib/Sema/SemaExpr.cpp | 10 ++-- clang/lib/Sema/SemaExprCXX.cpp | 8 ++-- clang/lib/Sema/SemaExprObjC.cpp | 45 +++++++++--------- clang/lib/Sema/SemaInit.cpp | 10 ++-- clang/lib/Sema/SemaOverload.cpp | 10 ++-- clang/lib/Sema/SemaPseudoObject.cpp | 2 +- 8 files changed, 119 insertions(+), 104 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 091c5c02f75df9..281e3b91de1d0c 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -437,6 +437,20 @@ enum class CXXSpecialMemberKind { Invalid }; +/// The kind of conversion being performed. +enum class CheckedConversionKind { + /// An implicit conversion. + Implicit, + /// A C-style cast. + CStyleCast, + /// A functional-style cast. + FunctionalCast, + /// A cast other than a C-style cast. + OtherCast, + /// A conversion for an operand of a builtin overloaded operator. + ForBuiltinOverloadedOp +}; + /// Sema - This implements semantic analysis and AST building for C. /// \nosubgrouping class Sema final : public SemaBase { @@ -700,28 +714,27 @@ class Sema final : public SemaBase { void checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D = nullptr); - /// The kind of conversion being performed. - enum CheckedConversionKind { - /// An implicit conversion. - CCK_ImplicitConversion, - /// A C-style cast. - CCK_CStyleCast, - /// A functional-style cast. - CCK_FunctionalCast, - /// A cast other than a C-style cast. - CCK_OtherCast, - /// A conversion for an operand of a builtin overloaded operator. - CCK_ForBuiltinOverloadedOp - }; + // /// The kind of conversion being performed. + // enum CheckedConversionKind { + // /// An implicit conversion. + // CCK_ImplicitConversion, + // /// A C-style cast. + // CCK_CStyleCast, + // /// A functional-style cast. + // CCK_FunctionalCast, + // /// A cast other than a C-style cast. + // CCK_OtherCast, + // /// A conversion for an operand of a builtin overloaded operator. 
+ // CCK_ForBuiltinOverloadedOp + // }; /// ImpCastExprToType - If Expr is not of type 'Type', insert an implicit /// cast. If there is already an implicit cast, merge into the existing one. /// If isLvalue, the result of the cast is an lvalue. - ExprResult - ImpCastExprToType(Expr *E, QualType Type, CastKind CK, - ExprValueKind VK = VK_PRValue, - const CXXCastPath *BasePath = nullptr, - CheckedConversionKind CCK = CCK_ImplicitConversion); + ExprResult ImpCastExprToType( + Expr *E, QualType Type, CastKind CK, ExprValueKind VK = VK_PRValue, + const CXXCastPath *BasePath = nullptr, + CheckedConversionKind CCK = CheckedConversionKind::Implicit); /// ScalarTypeToBooleanCastKind - Returns the cast kind corresponding /// to the conversion from scalar type ScalarTy to the Boolean type. @@ -1781,8 +1794,9 @@ class Sema final : public SemaBase { public: static bool isCast(CheckedConversionKind CCK) { - return CCK == CCK_CStyleCast || CCK == CCK_FunctionalCast || - CCK == CCK_OtherCast; + return CCK == CheckedConversionKind::CStyleCast || + CCK == CheckedConversionKind::FunctionalCast || + CCK == CheckedConversionKind::OtherCast; } /// ActOnCXXNamedCast - Parse @@ -6739,11 +6753,10 @@ class Sema final : public SemaBase { bool IsStringLiteralToNonConstPointerConversion(Expr *From, QualType ToType); - ExprResult - PerformImplicitConversion(Expr *From, QualType ToType, - const ImplicitConversionSequence &ICS, - AssignmentAction Action, - CheckedConversionKind CCK = CCK_ImplicitConversion); + ExprResult PerformImplicitConversion( + Expr *From, QualType ToType, const ImplicitConversionSequence &ICS, + AssignmentAction Action, + CheckedConversionKind CCK = CheckedConversionKind::Implicit); ExprResult PerformImplicitConversion(Expr *From, QualType ToType, const StandardConversionSequence &SCS, AssignmentAction Action, @@ -7064,7 +7077,7 @@ class Sema final : public SemaBase { ExprResult PerformQualificationConversion( Expr *E, QualType Ty, ExprValueKind VK = VK_PRValue, - CheckedConversionKind CCK = CCK_ImplicitConversion); + CheckedConversionKind CCK = CheckedConversionKind::Implicit); bool CanPerformCopyInitialization(const InitializedEntity &Entity, ExprResult Init); diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index b0c28531fe8738..126fd3797417ca 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -155,7 +155,7 @@ namespace { Self.CheckCastAlign(SrcExpr.get(), DestType, OpRange); } - void checkObjCConversion(Sema::CheckedConversionKind CCK) { + void checkObjCConversion(CheckedConversionKind CCK) { assert(Self.getLangOpts().allowsNonTrivialObjCLifetimeQualifiers()); Expr *src = SrcExpr.get(); @@ -248,18 +248,14 @@ static TryCastResult TryStaticMemberPointerUpcast(Sema &Self, ExprResult &SrcExp CastKind &Kind, CXXCastPath &BasePath); -static TryCastResult TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, - QualType DestType, - Sema::CheckedConversionKind CCK, - SourceRange OpRange, - unsigned &msg, CastKind &Kind, - bool ListInitialization); +static TryCastResult +TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, QualType DestType, + CheckedConversionKind CCK, SourceRange OpRange, + unsigned &msg, CastKind &Kind, bool ListInitialization); static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr, - QualType DestType, - Sema::CheckedConversionKind CCK, - SourceRange OpRange, - unsigned &msg, CastKind &Kind, - CXXCastPath &BasePath, + QualType DestType, CheckedConversionKind CCK, + SourceRange OpRange, unsigned &msg, + CastKind &Kind, 
CXXCastPath &BasePath, bool ListInitialization); static TryCastResult TryConstCast(Sema &Self, ExprResult &SrcExpr, QualType DestType, bool CStyle, @@ -1223,7 +1219,7 @@ void CastOperation::CheckReinterpretCast() { if (isValidCast(tcr)) { if (Self.getLangOpts().allowsNonTrivialObjCLifetimeQualifiers()) - checkObjCConversion(Sema::CCK_OtherCast); + checkObjCConversion(CheckedConversionKind::OtherCast); DiagnoseReinterpretUpDownCast(Self, SrcExpr.get(), DestType, OpRange); if (unsigned DiagID = checkCastFunctionType(Self, SrcExpr, DestType)) @@ -1274,9 +1270,9 @@ void CastOperation::CheckStaticCast() { } unsigned msg = diag::err_bad_cxx_cast_generic; - TryCastResult tcr - = TryStaticCast(Self, SrcExpr, DestType, Sema::CCK_OtherCast, OpRange, msg, - Kind, BasePath, /*ListInitialization=*/false); + TryCastResult tcr = + TryStaticCast(Self, SrcExpr, DestType, CheckedConversionKind::OtherCast, + OpRange, msg, Kind, BasePath, /*ListInitialization=*/false); if (tcr != TC_Success && msg != 0) { if (SrcExpr.isInvalid()) return; @@ -1296,7 +1292,7 @@ void CastOperation::CheckStaticCast() { if (Kind == CK_BitCast) checkCastAlign(); if (Self.getLangOpts().allowsNonTrivialObjCLifetimeQualifiers()) - checkObjCConversion(Sema::CCK_OtherCast); + checkObjCConversion(CheckedConversionKind::OtherCast); } else { SrcExpr = ExprError(); } @@ -1317,14 +1313,13 @@ static bool IsAddressSpaceConversion(QualType SrcType, QualType DestType) { /// possible. If @p CStyle, ignore access restrictions on hierarchy casting /// and casting away constness. static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr, - QualType DestType, - Sema::CheckedConversionKind CCK, + QualType DestType, CheckedConversionKind CCK, SourceRange OpRange, unsigned &msg, CastKind &Kind, CXXCastPath &BasePath, bool ListInitialization) { // Determine whether we have the semantics of a C-style cast. - bool CStyle - = (CCK == Sema::CCK_CStyleCast || CCK == Sema::CCK_FunctionalCast); + bool CStyle = (CCK == CheckedConversionKind::CStyleCast || + CCK == CheckedConversionKind::FunctionalCast); // The order the tests is not entirely arbitrary. There is one conversion // that can be handled in two different ways. Given: @@ -1884,11 +1879,11 @@ TryStaticMemberPointerUpcast(Sema &Self, ExprResult &SrcExpr, QualType SrcType, /// /// An expression e can be explicitly converted to a type T using a /// @c static_cast if the declaration "T t(e);" is well-formed [...]. -TryCastResult -TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, QualType DestType, - Sema::CheckedConversionKind CCK, - SourceRange OpRange, unsigned &msg, - CastKind &Kind, bool ListInitialization) { +TryCastResult TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, + QualType DestType, + CheckedConversionKind CCK, + SourceRange OpRange, unsigned &msg, + CastKind &Kind, bool ListInitialization) { if (DestType->isRecordType()) { if (Self.RequireCompleteType(OpRange.getBegin(), DestType, diag::err_bad_cast_incomplete) || @@ -1900,13 +1895,14 @@ TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, QualType DestType, } InitializedEntity Entity = InitializedEntity::InitializeTemporary(DestType); - InitializationKind InitKind - = (CCK == Sema::CCK_CStyleCast) - ? InitializationKind::CreateCStyleCast(OpRange.getBegin(), OpRange, - ListInitialization) - : (CCK == Sema::CCK_FunctionalCast) - ? 
InitializationKind::CreateFunctionalCast(OpRange, ListInitialization) - : InitializationKind::CreateCast(OpRange); + InitializationKind InitKind = + (CCK == CheckedConversionKind::CStyleCast) + ? InitializationKind::CreateCStyleCast(OpRange.getBegin(), OpRange, + ListInitialization) + : (CCK == CheckedConversionKind::FunctionalCast) + ? InitializationKind::CreateFunctionalCast(OpRange, + ListInitialization) + : InitializationKind::CreateCast(OpRange); Expr *SrcExprRaw = SrcExpr.get(); // FIXME: Per DR242, we should check for an implicit conversion sequence // or for a constructor that could be invoked by direct-initialization @@ -1918,8 +1914,8 @@ TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, QualType DestType, // There is no other way that works. // On the other hand, if we're checking a C-style cast, we've still got // the reinterpret_cast way. - bool CStyle - = (CCK == Sema::CCK_CStyleCast || CCK == Sema::CCK_FunctionalCast); + bool CStyle = (CCK == CheckedConversionKind::CStyleCast || + CCK == CheckedConversionKind::FunctionalCast); if (InitSeq.Failed() && (CStyle || !DestType->isReferenceType())) return TC_NotApplicable; @@ -2814,8 +2810,9 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle, if (isValidCast(tcr)) Kind = CK_NoOp; - Sema::CheckedConversionKind CCK = - FunctionalStyle ? Sema::CCK_FunctionalCast : Sema::CCK_CStyleCast; + CheckedConversionKind CCK = FunctionalStyle + ? CheckedConversionKind::FunctionalCast + : CheckedConversionKind::CStyleCast; if (tcr == TC_NotApplicable) { tcr = TryAddressSpaceCast(Self, SrcExpr, DestType, /*CStyle*/ true, msg, Kind); @@ -3201,7 +3198,7 @@ void CastOperation::CheckCStyleCast() { // ARC imposes extra restrictions on casts. if (Self.getLangOpts().allowsNonTrivialObjCLifetimeQualifiers()) { - checkObjCConversion(Sema::CCK_CStyleCast); + checkObjCConversion(CheckedConversionKind::CStyleCast); if (SrcExpr.isInvalid()) return; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 7c3faba0f78819..d2c77ad61644f0 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -10177,8 +10177,9 @@ Sema::CheckSingleAssignmentConstraints(QualType LHSType, ExprResult &CallerRHS, // diagnostics and just checking for errors, e.g., during overload // resolution, return Incompatible to indicate the failure. if (getLangOpts().allowsNonTrivialObjCLifetimeQualifiers() && - CheckObjCConversion(SourceRange(), Ty, E, CCK_ImplicitConversion, - Diagnose, DiagnoseCFAudited) != ACR_okay) { + CheckObjCConversion(SourceRange(), Ty, E, + CheckedConversionKind::Implicit, Diagnose, + DiagnoseCFAudited) != ACR_okay) { if (!Diagnose) return Incompatible; } @@ -12899,14 +12900,15 @@ QualType Sema::CheckCompareOperands(ExprResult &LHS, ExprResult &RHS, Expr *E = LHS.get(); if (getLangOpts().ObjCAutoRefCount) CheckObjCConversion(SourceRange(), RHSType, E, - CCK_ImplicitConversion); + CheckedConversionKind::Implicit); LHS = ImpCastExprToType(E, RHSType, RPT ? 
CK_BitCast :CK_CPointerToObjCPointerCast); } else { Expr *E = RHS.get(); if (getLangOpts().ObjCAutoRefCount) - CheckObjCConversion(SourceRange(), LHSType, E, CCK_ImplicitConversion, + CheckObjCConversion(SourceRange(), LHSType, E, + CheckedConversionKind::Implicit, /*Diagnose=*/true, /*DiagnoseCFAudited=*/false, Opc); RHS = ImpCastExprToType(E, LHSType, diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 74ed3fe7bd5201..f4a91ececfbb57 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -4250,7 +4250,8 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, AssignmentAction Action, CheckedConversionKind CCK) { // C++ [over.match.oper]p7: [...] operands of class type are converted [...] - if (CCK == CCK_ForBuiltinOverloadedOp && !From->getType()->isRecordType()) + if (CCK == CheckedConversionKind::ForBuiltinOverloadedOp && + !From->getType()->isRecordType()) return From; switch (ICS.getKind()) { @@ -4311,7 +4312,7 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, // C++ [over.match.oper]p7: // [...] the second standard conversion sequence of a user-defined // conversion sequence is not applied. - if (CCK == CCK_ForBuiltinOverloadedOp) + if (CCK == CheckedConversionKind::ForBuiltinOverloadedOp) return From; return PerformImplicitConversion(From, ToType, ICS.UserDefined.After, @@ -4352,7 +4353,8 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, const StandardConversionSequence& SCS, AssignmentAction Action, CheckedConversionKind CCK) { - bool CStyle = (CCK == CCK_CStyleCast || CCK == CCK_FunctionalCast); + bool CStyle = (CCK == CheckedConversionKind::CStyleCast || + CCK == CheckedConversionKind::FunctionalCast); // Overall FIXME: we are recomputing too many types here and doing far too // much extra work. What this means is that we need to keep track of more diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp index 3148f0db6e20c8..b13a9d426983b7 100644 --- a/clang/lib/Sema/SemaExprObjC.cpp +++ b/clang/lib/Sema/SemaExprObjC.cpp @@ -3745,22 +3745,22 @@ bool Sema::isKnownName(StringRef name) { template <typename DiagBuilderT> static void addFixitForObjCARCConversion( - Sema &S, DiagBuilderT &DiagB, Sema::CheckedConversionKind CCK, + Sema &S, DiagBuilderT &DiagB, CheckedConversionKind CCK, SourceLocation afterLParen, QualType castType, Expr *castExpr, Expr *realCast, const char *bridgeKeyword, const char *CFBridgeName) { // We handle C-style and implicit casts here.
switch (CCK) { - case Sema::CCK_ImplicitConversion: - case Sema::CCK_ForBuiltinOverloadedOp: - case Sema::CCK_CStyleCast: - case Sema::CCK_OtherCast: + case CheckedConversionKind::Implicit: + case CheckedConversionKind::ForBuiltinOverloadedOp: + case CheckedConversionKind::CStyleCast: + case CheckedConversionKind::OtherCast: break; - case Sema::CCK_FunctionalCast: + case CheckedConversionKind::FunctionalCast: return; } if (CFBridgeName) { - if (CCK == Sema::CCK_OtherCast) { + if (CCK == CheckedConversionKind::OtherCast) { if (const CXXNamedCastExpr *NCE = dyn_cast<CXXNamedCastExpr>(realCast)) { SourceRange range(NCE->getOperatorLoc(), NCE->getAngleBrackets().getEnd()); @@ -3805,9 +3805,9 @@ static void addFixitForObjCARCConversion( return; } - if (CCK == Sema::CCK_CStyleCast) { + if (CCK == CheckedConversionKind::CStyleCast) { DiagB.AddFixItHint(FixItHint::CreateInsertion(afterLParen, bridgeKeyword)); - } else if (CCK == Sema::CCK_OtherCast) { + } else if (CCK == CheckedConversionKind::OtherCast) { if (const CXXNamedCastExpr *NCE = dyn_cast<CXXNamedCastExpr>(realCast)) { std::string castCode = "("; castCode += bridgeKeyword; @@ -3866,12 +3866,12 @@ static ObjCBridgeRelatedAttr *ObjCBridgeRelatedAttrFromType(QualType T, return nullptr; } -static void -diagnoseObjCARCConversion(Sema &S, SourceRange castRange, - QualType castType, ARCConversionTypeClass castACTC, - Expr *castExpr, Expr *realCast, - ARCConversionTypeClass exprACTC, - Sema::CheckedConversionKind CCK) { +static void diagnoseObjCARCConversion(Sema &S, SourceRange castRange, + QualType castType, + ARCConversionTypeClass castACTC, + Expr *castExpr, Expr *realCast, + ARCConversionTypeClass exprACTC, + CheckedConversionKind CCK) { SourceLocation loc = (castRange.isValid() ? castRange.getBegin() : castExpr->getExprLoc()); @@ -3927,7 +3927,7 @@ diagnoseObjCARCConversion(Sema &S, SourceRange castRange, assert(CreateRule != ACC_bottom && "This cast should already be accepted."); if (CreateRule != ACC_plusOne) { - auto DiagB = (CCK != Sema::CCK_OtherCast) + auto DiagB = (CCK != CheckedConversionKind::OtherCast) ? S.Diag(noteLoc, diag::note_arc_bridge) : S.Diag(noteLoc, diag::note_arc_cstyle_bridge); @@ -3937,7 +3937,7 @@ diagnoseObjCARCConversion(Sema &S, SourceRange castRange, if (CreateRule != ACC_plusZero) { - auto DiagB = (CCK == Sema::CCK_OtherCast && !br) + auto DiagB = (CCK == CheckedConversionKind::OtherCast && !br) ? S.Diag(noteLoc, diag::note_arc_cstyle_bridge_transfer) << castExprType : S.Diag(br ? castExpr->getExprLoc() : noteLoc, @@ -3968,7 +3968,7 @@ diagnoseObjCARCConversion(Sema &S, SourceRange castRange, assert(CreateRule != ACC_bottom && "This cast should already be accepted."); if (CreateRule != ACC_plusOne) { - auto DiagB = (CCK != Sema::CCK_OtherCast) + auto DiagB = (CCK != CheckedConversionKind::OtherCast) ? S.Diag(noteLoc, diag::note_arc_bridge) : S.Diag(noteLoc, diag::note_arc_cstyle_bridge); addFixitForObjCARCConversion(S, DiagB, CCK, afterLParen, @@ -3977,7 +3977,7 @@ diagnoseObjCARCConversion(Sema &S, SourceRange castRange, if (CreateRule != ACC_plusZero) { - auto DiagB = (CCK == Sema::CCK_OtherCast && !br) + auto DiagB = (CCK == CheckedConversionKind::OtherCast && !br) ? S.Diag(noteLoc, diag::note_arc_cstyle_bridge_retained) << castType : S.Diag(br ? castExpr->getExprLoc() : noteLoc, @@ -4403,7 +4403,8 @@ Sema::CheckObjCConversion(SourceRange castRange, QualType castType, // Check for viability and report error if casting an rvalue to a // life-time qualifier. if (castACTC == ACTC_retainable &&
if (castACTC == ACTC_retainable && - (CCK == CCK_CStyleCast || CCK == CCK_OtherCast) && + (CCK == CheckedConversionKind::CStyleCast || + CCK == CheckedConversionKind::OtherCast) && castType != castExprType) { const Type *DT = castType.getTypePtr(); QualType QDT = castType; @@ -4517,11 +4518,11 @@ void Sema::diagnoseARCUnbridgedCast(Expr *e) { if (CStyleCastExpr *cast = dyn_cast(realCast)) { castRange = SourceRange(cast->getLParenLoc(), cast->getRParenLoc()); castType = cast->getTypeAsWritten(); - CCK = CCK_CStyleCast; + CCK = CheckedConversionKind::CStyleCast; } else if (ExplicitCastExpr *cast = dyn_cast(realCast)) { castRange = cast->getTypeInfoAsWritten()->getTypeLoc().getSourceRange(); castType = cast->getTypeAsWritten(); - CCK = CCK_OtherCast; + CCK = CheckedConversionKind::OtherCast; } else { llvm_unreachable("Unexpected ImplicitCastExpr"); } diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index fb7a80ab02846c..e86f7578ff0c05 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9057,11 +9057,11 @@ ExprResult InitializationSequence::Perform(Sema &S, } } - Sema::CheckedConversionKind CCK - = Kind.isCStyleCast()? Sema::CCK_CStyleCast - : Kind.isFunctionalCast()? Sema::CCK_FunctionalCast - : Kind.isExplicitCast()? Sema::CCK_OtherCast - : Sema::CCK_ImplicitConversion; + CheckedConversionKind CCK = + Kind.isCStyleCast() ? CheckedConversionKind::CStyleCast + : Kind.isFunctionalCast() ? CheckedConversionKind::FunctionalCast + : Kind.isExplicitCast() ? CheckedConversionKind::OtherCast + : CheckedConversionKind::Implicit; ExprResult CurInitExprRes = S.PerformImplicitConversion(CurInit.get(), Step->Type, *Step->ICS, getAssignmentAction(Entity), CCK); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 227ef564ba3e08..adc319e97b7625 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -14506,7 +14506,7 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc, // operator node. ExprResult InputRes = PerformImplicitConversion( Input, Best->BuiltinParamTypes[0], Best->Conversions[0], AA_Passing, - CCK_ForBuiltinOverloadedOp); + CheckedConversionKind::ForBuiltinOverloadedOp); if (InputRes.isInvalid()) return ExprError(); Input = InputRes.get(); @@ -14989,14 +14989,14 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, // operator node. ExprResult ArgsRes0 = PerformImplicitConversion( Args[0], Best->BuiltinParamTypes[0], Best->Conversions[0], - AA_Passing, CCK_ForBuiltinOverloadedOp); + AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes0.isInvalid()) return ExprError(); Args[0] = ArgsRes0.get(); ExprResult ArgsRes1 = PerformImplicitConversion( Args[1], Best->BuiltinParamTypes[1], Best->Conversions[1], - AA_Passing, CCK_ForBuiltinOverloadedOp); + AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes1.isInvalid()) return ExprError(); Args[1] = ArgsRes1.get(); @@ -15367,14 +15367,14 @@ ExprResult Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc, // operator node. 
ExprResult ArgsRes0 = PerformImplicitConversion( Args[0], Best->BuiltinParamTypes[0], Best->Conversions[0], - AA_Passing, CCK_ForBuiltinOverloadedOp); + AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes0.isInvalid()) return ExprError(); Args[0] = ArgsRes0.get(); ExprResult ArgsRes1 = PerformImplicitConversion( Args[1], Best->BuiltinParamTypes[1], Best->Conversions[1], - AA_Passing, CCK_ForBuiltinOverloadedOp); + AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes1.isInvalid()) return ExprError(); Args[1] = ArgsRes1.get(); diff --git a/clang/lib/Sema/SemaPseudoObject.cpp b/clang/lib/Sema/SemaPseudoObject.cpp index 82774760b34d44..c6a0a182d3583a 100644 --- a/clang/lib/Sema/SemaPseudoObject.cpp +++ b/clang/lib/Sema/SemaPseudoObject.cpp @@ -1136,7 +1136,7 @@ static void CheckKeyForObjCARCConversion(Sema &S, QualType ContainerT, return; QualType T = Getter->parameters()[0]->getType(); S.CheckObjCConversion(Key->getSourceRange(), T, Key, - Sema::CCK_ImplicitConversion); + CheckedConversionKind::Implicit); } bool ObjCSubscriptOpBuilder::findAtIndexGetter() { From 49b209d0d1833a339e66735e1288c1805224603e Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Wed, 17 Apr 2024 09:11:43 +0200 Subject: [PATCH 219/300] Revert "[Libomptarget] Rework Record & Replay to be a plugin member" (#89028) Reverts llvm/llvm-project#88928 This broke the AMDGPU buildbots: https://lab.llvm.org/buildbot/#/builders/193/builds/50201 https://lab.llvm.org/staging/#/builders/185/builds/5565 https://lab.llvm.org/buildbot/#/builders/259/builds/2955 --- .../common/include/PluginInterface.h | 11 ------ .../common/src/PluginInterface.cpp | 39 +++++++------------ 2 files changed, 15 insertions(+), 35 deletions(-) diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h index 7f05464f36c1f3..79e8464bfda5c1 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h @@ -45,8 +45,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" -struct RecordReplayTy; - namespace llvm { namespace omp { namespace target { @@ -1033,12 +1031,6 @@ struct GenericPluginTy { return *RPCServer; } - /// Get a reference to the R&R interface for this plugin. - RecordReplayTy &getRecordAndReplay() const { - assert(RecordReplay && "R&R not initialized"); - return *RecordReplay; - } - /// Get the OpenMP requires flags set for this plugin. int64_t getRequiresFlags() const { return RequiresFlags; } @@ -1228,9 +1220,6 @@ struct GenericPluginTy { /// The interface between the plugin and the GPU for host services. RPCServerTy *RPCServer; - - /// The interface into the record-and-replay functionality. - RecordReplayTy *RecordReplay; }; namespace Plugin { diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp index 6df9798f12e3d0..b5f3c45c835fdb 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp @@ -362,6 +362,8 @@ struct RecordReplayTy { } }; +static RecordReplayTy RecordReplay; + // Extract the mapping of host function pointers to device function pointers // from the entry table. 
Functions marked as 'indirect' in OpenMP will have // offloading entries generated for them which map the host's function pointer @@ -471,8 +473,7 @@ GenericKernelTy::getKernelLaunchEnvironment( // Ctor/Dtor have no arguments, replaying uses the original kernel launch // environment. Older versions of the compiler do not generate a kernel // launch environment. - if (isCtorOrDtor() || - GenericDevice.Plugin.getRecordAndReplay().isReplaying() || + if (isCtorOrDtor() || RecordReplay.isReplaying() || Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR) return nullptr; @@ -561,7 +562,6 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, // Record the kernel description after we modified the argument count and num // blocks/threads. - RecordReplayTy &RecordReplay = GenericDevice.Plugin.getRecordAndReplay(); if (RecordReplay.isRecording()) { RecordReplay.saveImage(getName(), getImage()); RecordReplay.saveKernelInput(getName(), getImage()); @@ -839,6 +839,9 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { delete MemoryManager; MemoryManager = nullptr; + if (RecordReplay.isRecordingOrReplaying()) + RecordReplay.deinit(); + if (RPCServer) if (auto Err = RPCServer->deinitDevice(*this)) return Err; @@ -855,7 +858,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { return deinitImpl(); } - Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, const __tgt_device_image *InputTgtImage) { @@ -890,8 +892,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, return std::move(Err); // Setup the global device memory pool if needed. - if (!Plugin.getRecordAndReplay().isReplaying() && - shouldSetupDeviceMemoryPool()) { + if (!RecordReplay.isReplaying() && shouldSetupDeviceMemoryPool()) { uint64_t HeapSize; auto SizeOrErr = getDeviceHeapSize(HeapSize); if (SizeOrErr) { @@ -1306,8 +1307,8 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind) { void *Alloc = nullptr; - if (Plugin.getRecordAndReplay().isRecordingOrReplaying()) - return Plugin.getRecordAndReplay().alloc(Size); + if (RecordReplay.isRecordingOrReplaying()) + return RecordReplay.alloc(Size); switch (Kind) { case TARGET_ALLOC_DEFAULT: @@ -1343,7 +1344,7 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) { // Free is a noop when recording or replaying. - if (Plugin.getRecordAndReplay().isRecordingOrReplaying()) + if (RecordReplay.isRecordingOrReplaying()) return Plugin::success(); int Res; @@ -1395,7 +1396,6 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo) { - RecordReplayTy &RecordReplay = Plugin.getRecordAndReplay(); AsyncInfoWrapperTy AsyncInfoWrapper( *this, RecordReplay.isRecordingOrReplaying() ? nullptr : AsyncInfo); @@ -1495,9 +1495,6 @@ Error GenericPluginTy::init() { RPCServer = new RPCServerTy(*this); assert(RPCServer && "Invalid RPC server"); - RecordReplay = new RecordReplayTy(); - assert(RecordReplay && "Invalid Record and Replay handler"); - return Plugin::success(); } @@ -1511,9 +1508,6 @@ Error GenericPluginTy::deinit() { assert(!Devices[DeviceId] && "Device was not deinitialized"); } - if (RecordReplay && RecordReplay->isRecordingOrReplaying()) - RecordReplay->deinit(); - // There is no global handler if no device is available.
if (GlobalHandler) delete GlobalHandler; @@ -1521,9 +1515,6 @@ Error GenericPluginTy::deinit() { if (RPCServer) delete RPCServer; - if (RecordReplay) - delete RecordReplay; - // Perform last deinitializations on the plugin. return deinitImpl(); } @@ -1639,12 +1630,12 @@ int32_t GenericPluginTy::initialize_record_replay(int32_t DeviceId, isRecord ? RecordReplayTy::RRStatusTy::RRRecording : RecordReplayTy::RRStatusTy::RRReplaying; - if (auto Err = RecordReplay->init(&Device, MemorySize, VAddr, Status, - SaveOutput, ReqPtrArgOffset)) { + if (auto Err = RecordReplay.init(&Device, MemorySize, VAddr, Status, + SaveOutput, ReqPtrArgOffset)) { REPORT("WARNING RR did not intialize RR-properly with %lu bytes" "(Error: %s)\n", MemorySize, toString(std::move(Err)).data()); - RecordReplay->setStatus(RecordReplayTy::RRStatusTy::RRDeactivated); + RecordReplay.setStatus(RecordReplayTy::RRStatusTy::RRDeactivated); if (!isRecord) { return OFFLOAD_FAIL; @@ -1993,8 +1984,8 @@ int32_t GenericPluginTy::get_global(__tgt_device_binary Binary, uint64_t Size, assert(DevicePtr && "Invalid device global's address"); // Save the loaded globals if we are recording. - if (getRecordAndReplay().isRecording()) - getRecordAndReplay().addEntry(Name, Size, *DevicePtr); + if (RecordReplay.isRecording()) + RecordReplay.addEntry(Name, Size, *DevicePtr); return OFFLOAD_SUCCESS; } From 9f3334e9932fc9b55cd3590b140913222454c031 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 17 Apr 2024 09:20:55 +0200 Subject: [PATCH 220/300] [mlir][SparseTensor] Add missing dependent dialect to pass (#88870) This commit fixes the following error when stopping the sparse compiler pipeline after bufferization (e.g., with `test-analysis-only`): ``` LLVM ERROR: Building op `vector.print` but it isn't known in this MLIRContext: the dialect may not be loaded or this operation hasn't been added by the dialect. 
See also https://mlir.llvm.org/getting_started/Faq/#registered-loaded-dependent-whats-up-with-dialects-management ``` --- mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td | 1 + .../Transforms/SparsificationAndBufferizationPass.cpp | 1 + .../Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td index 4706d5ba2f218c..2f844cee5ff528 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td @@ -460,6 +460,7 @@ def SparsificationAndBufferization : Pass<"sparsification-and-bufferization", "M "memref::MemRefDialect", "scf::SCFDialect", "sparse_tensor::SparseTensorDialect", + "vector::VectorDialect" ]; } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp index f497be6e48eba1..3a8972072ac3b1 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp @@ -24,6 +24,7 @@ #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" #include "mlir/Dialect/SparseTensor/Transforms/Passes.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/Passes.h" diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir index 2ff73923c8327d..467b671500e173 100755 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir @@ -30,6 +30,10 @@ // Do the same run, but now with direct IR generation and VLA vectorization. // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %} +// Test that test-bufferization-analysis-only works. This option is useful +// for understanding why buffer copies were inserted. +// RUN: mlir-opt %s --sparsifier="test-bufferization-analysis-only" -o /dev/null + #Sparse1 = #sparse_tensor.encoding<{ map = (i, j, k) -> ( j : compressed, From 889dfd4ab35892840f2bd2d6d7fed6fac025e18e Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Wed, 17 Apr 2024 10:04:22 +0200 Subject: [PATCH 221/300] [libc][msan] Fix "non-constexpr function '__msan_unpoison' cannot be used in a constant expression" (#88719) Prior to this patch, calling `cpp::bit_cast` in `constexpr` expressions under `-fsanitize=memory` would fail with the following message "non-constexpr function '__msan_unpoison' cannot be used in a constant expression". This patch makes sure that the `__msan_unpoison` expression is guarded by `!__builtin_is_constant_evaluated()`. 
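For illustration, a minimal standalone sketch of why the new guard is constexpr-safe (hypothetical code, not from the patch; `bit_cast_sketch` and `msan_unpoison_stub` are invented stand-ins for `cpp::bit_cast` and the real `__msan_unpoison`):

```cpp
#include <cstddef>
#include <cstdint>

// Stub standing in for the MSan runtime hook __msan_unpoison.
static void msan_unpoison_stub(const void *, size_t) {}

template <class To, class From>
constexpr To bit_cast_sketch(const From &from) {
  To to = __builtin_bit_cast(To, from);
  // In a constant-evaluated context __builtin_is_constant_evaluated() is
  // true, so the non-constexpr call below is never evaluated. The old
  // __builtin_constant_p(*addr) guard gave no such guarantee, which is
  // what produced the reported diagnostic.
  if (!__builtin_is_constant_evaluated())
    msan_unpoison_stub(&to, sizeof(To));
  return to;
}

static_assert(bit_cast_sketch<uint32_t>(1.0f) == 0x3f800000u);
```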
--- libc/src/__support/macros/sanitizer.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libc/src/__support/macros/sanitizer.h b/libc/src/__support/macros/sanitizer.h index bd9b62b7121a14..baf44f7996cabb 100644 --- a/libc/src/__support/macros/sanitizer.h +++ b/libc/src/__support/macros/sanitizer.h @@ -47,14 +47,13 @@ // Functions to unpoison memory //----------------------------------------------------------------------------- -#if defined(LIBC_HAVE_MEMORY_SANITIZER) && __has_builtin(__builtin_constant_p) +#if defined(LIBC_HAVE_MEMORY_SANITIZER) // Only perform MSAN unpoison in non-constexpr context. #include <sanitizer/msan_interface.h> #define MSAN_UNPOISON(addr, size) \ do { \ - if (!__builtin_constant_p(*addr)) { \ + if (!__builtin_is_constant_evaluated()) \ __msan_unpoison(addr, size); \ - } \ } while (0) #else #define MSAN_UNPOISON(ptr, size) From 17b86d5978af8d171fa28763a9e5eba3ce93713a Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Wed, 17 Apr 2024 15:59:37 +0800 Subject: [PATCH 222/300] [X86][NFC] Add test cases for pr88958 --- llvm/test/CodeGen/X86/combine-ptest.ll | 42 ++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll index 337edef96beee2..3a695bfc6234db 100644 --- a/llvm/test/CodeGen/X86/combine-ptest.ll +++ b/llvm/test/CodeGen/X86/combine-ptest.ll @@ -397,6 +397,48 @@ define i1 @PR38788(<4 x i32> %0, <4 x i32> %1) { ret i1 %7 } +define i32 @PR88958_1(ptr %0, <2 x i64> %1) { +; SSE-LABEL: PR88958_1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ptest %xmm0, %xmm1 +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; +; AVX-LABEL: PR88958_1: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vptest %xmm0, %xmm1 +; AVX-NEXT: sete %al +; AVX-NEXT: retq + %3 = load <2 x i64>, ptr %0 + %4 = tail call i32 @llvm.x86.sse41.ptestz(<2 x i64> %3, <2 x i64> %1) + ret i32 %4 +} + +define i32 @PR88958_2(ptr %0, <2 x i64> %1) { +; SSE-LABEL: PR88958_2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ptest %xmm0, %xmm1 +; SSE-NEXT: setb %al +; SSE-NEXT: retq +; +; AVX-LABEL: PR88958_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vptest %xmm0, %xmm1 +; AVX-NEXT: setb %al +; AVX-NEXT: retq + %3 = load <2 x i64>, ptr %0 + %4 = tail call i32 @llvm.x86.sse41.ptestc(<2 x i64> %3, <2 x i64> %1) + ret i32 %4 +} + declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone From d1a69e4a6ee0b04778da7728123c47eef2290564 Mon Sep 17 00:00:00 2001 From: shamithoke <152091883+shamithoke@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:16:10 +0530 Subject: [PATCH 223/300] Move gfni for bitreverse check out of SSSE3. (#88938) For lowering bitreverse using GFNI, the check is currently placed under the SSSE3 block. It can be pulled out of SSSE3, since the GFNI lowering does not depend on it.
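For context, a hedged sketch of the kind of scalar input this affects (illustrative only; per the diff below, with GFNI enabled the `ISD::BITREVERSE` actions for i8/i16/i32/i64 are now registered independently of the SSSE3 block):

```cpp
#include <cstdint>

// Scalar bitreverse via the clang builtin; with GFNI enabled (e.g. -mgfni)
// this can take the GFNI custom-lowering path regardless of the
// SSSE3-guarded setup.
uint32_t reverse_bits(uint32_t x) { return __builtin_bitreverse32(x); }
```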
Co-authored-by: shami --- llvm/lib/Target/X86/X86ISelLowering.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f16a751a166d69..27107f554fccf1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1276,6 +1276,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); } + if (Subtarget.hasGFNI()) { + setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i16, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i64, Custom); + } + if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { setOperationAction(ISD::ABS, MVT::v16i8, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); @@ -1286,13 +1293,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTLZ, VT, Custom); } - if (Subtarget.hasGFNI()) { - setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); - setOperationAction(ISD::BITREVERSE, MVT::i16, Custom); - setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); - setOperationAction(ISD::BITREVERSE, MVT::i64, Custom); - } - // These might be better off as horizontal vector ops. setOperationAction(ISD::ADD, MVT::i16, Custom); setOperationAction(ISD::ADD, MVT::i32, Custom); From a16bb0701409376dee3a587ae351a6019d6de4e0 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 17 Apr 2024 09:16:46 +0000 Subject: [PATCH 224/300] [lldb][test] Improve invalid compiler error message I was debugging space separation issues when passing user arguments and noticed this error is really hard to read in that scenario. Put "" around the invalid compiler name so you can tell whether spaces around it are causing the problem. --- lldb/packages/Python/lldbsuite/test/dotest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/packages/Python/lldbsuite/test/dotest.py b/lldb/packages/Python/lldbsuite/test/dotest.py index 8c29145ecc5272..2ec4a840b91675 100644 --- a/lldb/packages/Python/lldbsuite/test/dotest.py +++ b/lldb/packages/Python/lldbsuite/test/dotest.py @@ -248,7 +248,7 @@ def parseOptionsAndInitTestdirs(): configuration.compiler = which(args.compiler) if not is_exe(configuration.compiler): logging.error( - "%s is not a valid compiler executable; aborting...", args.compiler + '"%s" is not a valid compiler executable; aborting...', args.compiler ) sys.exit(-1) else: From d9a5aa8e2d755643cf4e7fa86aa831ed226fe54d Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 17 Apr 2024 18:22:05 +0900 Subject: [PATCH 225/300] [PatternMatch] Do not accept undef elements in m_AllOnes() and friends (#88217) Change all the cstval_pred_ty based PatternMatch helpers (things like m_AllOnes and m_Zero) to only allow poison elements inside vector splats, not undef elements. Historically, we used to represent non-demanded elements in vectors using undef. Nowadays, we use poison instead. As such, I believe that support for undef in vector splats is no longer useful. At the same time, while poison splat elements are pretty much always safe to ignore, this is not generally the case for undef elements. We have existing miscompiles in our tests due to this (see the masked-merge-*.ll tests changed here) and it's easy to miss such cases in the future, now that we write tests using poison instead of undef elements.
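(For concreteness, a hedged sketch of the semantic change; illustrative code, not from the patch:)

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// After this change, an all-ones splat with poison lanes still matches
// m_AllOnes(), while one with undef lanes no longer does.
static void demo() {
  LLVMContext Ctx;
  Type *I8 = Type::getInt8Ty(Ctx);
  Constant *Ones = Constant::getAllOnesValue(I8);
  Constant *PoisonSplat =
      ConstantVector::get({Ones, PoisonValue::get(I8), Ones, Ones});
  Constant *UndefSplat =
      ConstantVector::get({Ones, UndefValue::get(I8), Ones, Ones});
  bool MatchesPoison =
      PatternMatch::match(PoisonSplat, PatternMatch::m_AllOnes()); // true
  bool MatchesUndef =
      PatternMatch::match(UndefSplat, PatternMatch::m_AllOnes()); // now false
  (void)MatchesPoison;
  (void)MatchesUndef;
}
```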
I think overall, keeping support for undef elements no longer makes sense, and we should drop it. Once this is done consistently, I think we may also consider allowing poison in m_APInt by default, as doing that change is much less risky than doing the same with undef. This change involves a substantial amount of test changes. For most tests, I've just replaced undef with poison, as I don't think there is value in retaining both. For some tests (where the distinction between undef and poison is important), I've duplicated tests. --- llvm/include/llvm/IR/PatternMatch.h | 35 +---- llvm/lib/Analysis/InstructionSimplify.cpp | 23 ++- llvm/lib/IR/Constants.cpp | 2 +- .../InstCombine/InstCombineAndOrXor.cpp | 12 +- .../InstCombine/X86/x86-vector-shifts.ll | 30 ++-- llvm/test/Transforms/InstCombine/abs-1.ll | 16 +- .../Transforms/InstCombine/add-mask-neg.ll | 6 +- llvm/test/Transforms/InstCombine/add.ll | 28 ++-- .../Transforms/InstCombine/and-or-icmps.ll | 61 +++++--- .../test/Transforms/InstCombine/and-xor-or.ll | 4 +- llvm/test/Transforms/InstCombine/and.ll | 86 +++++------ llvm/test/Transforms/InstCombine/and2.ll | 19 ++- llvm/test/Transforms/InstCombine/ashr-lshr.ll | 28 ++-- .../Transforms/InstCombine/ashr-or-mul-abs.ll | 8 +- .../InstCombine/binop-and-shifts.ll | 68 ++++---- .../InstCombine/binop-of-displaced-shifts.ll | 22 +-- ...ern-between-zero-and-positive-threshold.ll | 20 +-- ...nt-low-bit-mask-and-icmp-eq-to-icmp-ule.ll | 6 +- ...nt-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll | 12 +- ...t-low-bit-mask-and-icmp-sge-to-icmp-sle.ll | 10 +- ...t-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll | 16 +- ...t-low-bit-mask-and-icmp-sle-to-icmp-sle.ll | 10 +- ...t-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll | 16 +- ...t-low-bit-mask-and-icmp-uge-to-icmp-ule.ll | 6 +- ...t-low-bit-mask-and-icmp-ugt-to-icmp-ugt.ll | 12 +- ...t-low-bit-mask-and-icmp-ule-to-icmp-ule.ll | 6 +- ...t-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll | 12 +- ...ze-low-bit-mask-and-icmp-eq-to-icmp-ule.ll | 8 +- ...ze-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll | 8 +- ...low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll | 20 +-- ...low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll | 20 +-- ...low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll | 8 +- ...low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll | 8 +- .../InstCombine/cast-int-icmp-eq-0.ll | 22 +-- .../InstCombine/cast-unsigned-icmp-eqcmp-0.ll | 36 ++--- llvm/test/Transforms/InstCombine/cast.ll | 56 +++---- .../test/Transforms/InstCombine/ctpop-cttz.ll | 6 +- llvm/test/Transforms/InstCombine/ctpop.ll | 17 +- .../Transforms/InstCombine/fabs-as-int.ll | 6 +- llvm/test/Transforms/InstCombine/fabs.ll | 4 +- llvm/test/Transforms/InstCombine/fast-math.ll | 6 +- .../Transforms/InstCombine/fcmp-special.ll | 20 +-- llvm/test/Transforms/InstCombine/fcmp.ll | 26 ++-- llvm/test/Transforms/InstCombine/fdiv.ll | 22 +-- llvm/test/Transforms/InstCombine/fma.ll | 8 +- llvm/test/Transforms/InstCombine/fmul.ll | 42 ++--- .../Transforms/InstCombine/fneg-as-int.ll | 6 +- .../InstCombine/fneg-fabs-as-int.ll | 6 +- llvm/test/Transforms/InstCombine/fneg.ll | 36 ++--- ...c-of-add-of-not-x-and-y-to-sub-x-from-y.ll | 20 +-- .../fold-sub-of-not-to-inc-of-add.ll | 6 +- llvm/test/Transforms/InstCombine/fpcast.ll | 6 +- llvm/test/Transforms/InstCombine/fsub.ll | 14 +- llvm/test/Transforms/InstCombine/funnel.ll | 54 +++---- .../get-lowbitmask-upto-and-including-bit.ll | 20 +-- .../hoist-negation-out-of-bias-calculation.ll | 6 +- .../hoist-not-from-ashr-operand.ll | 8 +- ...al-to-icmp-eq-of-lshr-val-by-bits-and-0.ll | 4 +- 
...al-to-icmp-eq-of-lshr-val-by-bits-and-0.ll | 20 +-- ...al-to-icmp-eq-of-lshr-val-by-bits-and-0.ll | 6 +- ...al-to-icmp-ne-of-lshr-val-by-bits-and-0.ll | 6 +- ...al-to-icmp-ne-of-lshr-val-by-bits-and-0.ll | 4 +- ...al-to-icmp-ne-of-lshr-val-by-bits-and-0.ll | 20 +-- llvm/test/Transforms/InstCombine/icmp.ll | 62 ++++---- .../integer-round-up-pow2-alignment.ll | 70 ++++----- ...rt-variable-mask-in-masked-merge-vector.ll | 30 ++-- .../InstCombine/lshr-and-negC-icmpeq-zero.ll | 20 +-- .../lshr-and-signbit-icmpeq-zero.ll | 20 +-- .../InstCombine/masked-merge-add.ll | 17 +- .../Transforms/InstCombine/masked-merge-or.ll | 17 +- .../InstCombine/masked-merge-xor.ll | 17 +- .../Transforms/InstCombine/min-positive.ll | 8 +- .../Transforms/InstCombine/minmax-fold.ll | 11 +- .../InstCombine/minmax-intrinsics.ll | 40 ++--- .../InstCombine/mul-inseltpoison.ll | 22 +-- llvm/test/Transforms/InstCombine/mul.ll | 30 ++-- llvm/test/Transforms/InstCombine/not-add.ll | 16 +- llvm/test/Transforms/InstCombine/not.ll | 4 +- ...of-two-or-zero-when-comparing-with-zero.ll | 34 ++-- .../InstCombine/operand-complexity.ll | 22 +-- llvm/test/Transforms/InstCombine/or.ll | 52 +++---- ...nput-masking-after-truncation-variant-b.ll | 20 +-- ...nput-masking-after-truncation-variant-c.ll | 20 +-- ...nput-masking-after-truncation-variant-d.ll | 20 +-- ...dant-left-shift-input-masking-variant-b.ll | 20 +-- ...dant-left-shift-input-masking-variant-c.ll | 12 +- ...dant-left-shift-input-masking-variant-d.ll | 12 +- llvm/test/Transforms/InstCombine/pr53357.ll | 8 +- ...nput-masking-after-truncation-variant-b.ll | 16 +- ...nput-masking-after-truncation-variant-c.ll | 20 +-- ...nput-masking-after-truncation-variant-d.ll | 20 +-- ...dant-left-shift-input-masking-variant-b.ll | 18 +-- ...dant-left-shift-input-masking-variant-c.ll | 12 +- ...dant-left-shift-input-masking-variant-d.ll | 10 +- .../reuse-constant-from-select-in-icmp.ll | 26 ++-- llvm/test/Transforms/InstCombine/rotate.ll | 57 +++---- .../InstCombine/saturating-add-sub.ll | 18 +-- .../InstCombine/select-of-bittest.ll | 89 ++++++----- llvm/test/Transforms/InstCombine/select.ll | 33 ++-- .../Transforms/InstCombine/select_meta.ll | 8 +- .../set-lowbits-mask-canonicalize.ll | 20 +-- llvm/test/Transforms/InstCombine/sext.ll | 28 ++-- .../shift-amount-reassociation-in-bittest.ll | 64 ++++---- ...ount-reassociation-with-truncation-ashr.ll | 26 ++-- ...ount-reassociation-with-truncation-lshr.ll | 32 ++-- .../InstCombine/shift-amount-reassociation.ll | 26 ++-- .../Transforms/InstCombine/shift-logic.ll | 88 +++++------ .../InstCombine/shl-and-negC-icmpeq-zero.ll | 20 +-- .../shl-and-signbit-icmpeq-zero.ll | 20 +-- .../signmask-of-sext-vs-of-shl-of-zext.ll | 42 +++-- llvm/test/Transforms/InstCombine/sub-not.ll | 8 +- llvm/test/Transforms/InstCombine/sub.ll | 44 +++--- .../InstCombine/trunc-inseltpoison.ll | 98 ++++++------ .../InstCombine/trunc-shift-trunc.ll | 16 +- llvm/test/Transforms/InstCombine/trunc.ll | 102 ++++++------ ...k-of-overflow-check-via-udiv-of-allones.ll | 6 +- ...-mul-overflow-check-via-udiv-of-allones.ll | 6 +- ...signext-of-variable-high-bit-extraction.ll | 12 +- llvm/test/Transforms/InstCombine/vec_sext.ll | 14 +- .../InstCombine/vector-casts-inseltpoison.ll | 20 +-- .../Transforms/InstCombine/vector-casts.ll | 20 +-- .../Transforms/InstCombine/vector-urem.ll | 18 +-- .../test/Transforms/InstCombine/vector-xor.ll | 66 ++++---- .../InstCombine/zext-bool-add-sub.ll | 28 ++-- llvm/test/Transforms/InstSimplify/AndOrXor.ll | 36 ++--- 
 llvm/test/Transforms/InstSimplify/call.ll | 10 +-
 llvm/test/Transforms/InstSimplify/compare.ll | 66 ++++----
 ...constantfold-add-nuw-allones-to-allones.ll | 8 +-
 .../constantfold-shl-nuw-C-to-C.ll | 8 +-
 llvm/test/Transforms/InstSimplify/div.ll | 20 +--
 .../InstSimplify/fast-math-strictfp.ll | 68 ++++----
 .../test/Transforms/InstSimplify/fast-math.ll | 60 ++++----
 llvm/test/Transforms/InstSimplify/fdiv.ll | 6 +-
 .../floating-point-arithmetic-strictfp.ll | 54 +++----
 .../InstSimplify/floating-point-arithmetic.ll | 38 ++---
 .../InstSimplify/floating-point-compare.ll | 30 ++--
 .../Transforms/InstSimplify/fminmax-folds.ll | 36 ++---
 llvm/test/Transforms/InstSimplify/fp-nan.ll | 13 +-
 .../InstSimplify/icmp-bool-constant.ll | 18 +--
 .../InstSimplify/icmp-not-bool-constant.ll | 36 ++---
 llvm/test/Transforms/InstSimplify/ldexp.ll | 5 +-
 llvm/test/Transforms/InstSimplify/mul.ll | 6 +-
 llvm/test/Transforms/InstSimplify/negate.ll | 12 +-
 llvm/test/Transforms/InstSimplify/or.ll | 73 +++++++--
 llvm/test/Transforms/InstSimplify/ptrmask.ll | 3 +-
 llvm/test/Transforms/InstSimplify/rem.ll | 6 +-
 .../InstSimplify/saturating-add-sub.ll | 8 +-
 llvm/test/Transforms/InstSimplify/sdiv.ll | 6 +-
 .../InstSimplify/select-inseltpoison.ll | 38 ++---
 llvm/test/Transforms/InstSimplify/select.ll | 38 ++---
 llvm/test/Transforms/InstSimplify/shift.ll | 35 +++--
 llvm/test/Transforms/InstSimplify/srem.ll | 6 +-
 llvm/test/Transforms/InstSimplify/sub.ll | 6 +-
 llvm/test/Transforms/InstSimplify/xor.ll | 20 ++-
 llvm/test/Transforms/Reassociate/inverses.ll | 6 +-
 llvm/test/Transforms/Reassociate/negation.ll | 8 +-
 llvm/unittests/IR/ConstantsTest.cpp | 39 ++++-
 llvm/unittests/IR/PatternMatch.cpp | 145 ++++++++++++++----
 158 files changed, 2042 insertions(+), 1839 deletions(-)

diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 92cb79d54afc29..98cc0e50376981 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -345,7 +345,7 @@ template <int64_t Val> inline constantint_match<Val> m_ConstantInt() {
 
 /// This helper class is used to match constant scalars, vector splats,
 /// and fixed width vectors that satisfy a specified predicate.
-/// For fixed width vector constants, undefined elements are ignored.
+/// For fixed width vector constants, poison elements are ignored.
 template <typename Predicate, typename ConstantVal>
 struct cstval_pred_ty : public Predicate {
   template <typename ITy> bool match(ITy *V) {
@@ -364,19 +364,19 @@ struct cstval_pred_ty : public Predicate {
       // Non-splat vector constant: check each element for a match.
       unsigned NumElts = FVTy->getNumElements();
       assert(NumElts != 0 && "Constant vector with no elements?");
-      bool HasNonUndefElements = false;
+      bool HasNonPoisonElements = false;
       for (unsigned i = 0; i != NumElts; ++i) {
         Constant *Elt = C->getAggregateElement(i);
         if (!Elt)
           return false;
-        if (isa<UndefValue>(Elt))
+        if (isa<PoisonValue>(Elt))
           continue;
         auto *CV = dyn_cast<ConstantInt>(Elt);
         if (!CV || !this->isValue(CV->getValue()))
           return false;
-        HasNonUndefElements = true;
+        HasNonPoisonElements = true;
       }
-      return HasNonUndefElements;
+      return HasNonPoisonElements;
     }
   }
   return false;
@@ -2587,31 +2587,6 @@ m_Not(const ValTy &V) {
   return m_c_Xor(m_AllOnes(), V);
 }
 
-template <typename ValTy> struct NotForbidUndef_match {
-  ValTy Val;
-  NotForbidUndef_match(const ValTy &V) : Val(V) {}
-
-  template <typename OpTy> bool match(OpTy *V) {
-    // We do not use m_c_Xor because that could match an arbitrary APInt that is
-    // not -1 as C and then fail to match the other operand if it is -1.
-    // This code should still work even when both operands are constants.
-    Value *X;
-    const APInt *C;
-    if (m_Xor(m_Value(X), m_APIntForbidUndef(C)).match(V) && C->isAllOnes())
-      return Val.match(X);
-    if (m_Xor(m_APIntForbidUndef(C), m_Value(X)).match(V) && C->isAllOnes())
-      return Val.match(X);
-    return false;
-  }
-};
-
-/// Matches a bitwise 'not' as 'xor V, -1' or 'xor -1, V'. For vectors, the
-/// constant value must be composed of only -1 scalar elements.
-template <typename ValTy>
-inline NotForbidUndef_match<ValTy> m_NotForbidUndef(const ValTy &V) {
-  return NotForbidUndef_match<ValTy>(V);
-}
-
 /// Matches an SMin with LHS and RHS in either order.
 template <typename LHS, typename RHS>
 inline MaxMin_match<ICmpInst, LHS, RHS, smin_pred_ty, true>
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 8955de6375dec4..06ba5ca4c6b352 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -1513,7 +1513,7 @@ static Value *simplifyAShrInst(Value *Op0, Value *Op1, bool IsExact,
 
   // -1 >>a X --> -1
   // (-1 << X) a>> X --> -1
-  // Do not return Op0 because it may contain undef elements if it's a vector.
+  // We could return the original -1 constant to preserve poison elements.
   if (match(Op0, m_AllOnes()) ||
       match(Op0, m_Shl(m_AllOnes(), m_Specific(Op1))))
     return Constant::getAllOnesValue(Op0->getType());
@@ -2281,7 +2281,7 @@ static Value *simplifyOrLogic(Value *X, Value *Y) {
   // (B ^ ~A) | (A & B) --> B ^ ~A
   // (~A ^ B) | (B & A) --> ~A ^ B
   // (B ^ ~A) | (B & A) --> B ^ ~A
-  if (match(X, m_c_Xor(m_NotForbidUndef(m_Value(A)), m_Value(B))) &&
+  if (match(X, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
       match(Y, m_c_And(m_Specific(A), m_Specific(B))))
     return X;
@@ -2298,31 +2298,29 @@ static Value *simplifyOrLogic(Value *X, Value *Y) {
   // (B & ~A) | ~(A | B) --> ~A
   // (B & ~A) | ~(B | A) --> ~A
   Value *NotA;
-  if (match(X,
-            m_c_And(m_CombineAnd(m_Value(NotA), m_NotForbidUndef(m_Value(A))),
-                    m_Value(B))) &&
+  if (match(X, m_c_And(m_CombineAnd(m_Value(NotA), m_Not(m_Value(A))),
+                       m_Value(B))) &&
       match(Y, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
     return NotA;
   // The same is true of Logical And
   // TODO: This could share the logic of the version above if there was a
   // version of LogicalAnd that allowed more than just i1 types.
-  if (match(X, m_c_LogicalAnd(
-                   m_CombineAnd(m_Value(NotA), m_NotForbidUndef(m_Value(A))),
-                   m_Value(B))) &&
+  if (match(X, m_c_LogicalAnd(m_CombineAnd(m_Value(NotA), m_Not(m_Value(A))),
+                              m_Value(B))) &&
      match(Y, m_Not(m_c_LogicalOr(m_Specific(A), m_Specific(B)))))
    return NotA;
 
   // ~(A ^ B) | (A & B) --> ~(A ^ B)
   // ~(A ^ B) | (B & A) --> ~(A ^ B)
   Value *NotAB;
-  if (match(X, m_CombineAnd(m_NotForbidUndef(m_Xor(m_Value(A), m_Value(B))),
+  if (match(X, m_CombineAnd(m_Not(m_Xor(m_Value(A), m_Value(B))),
                             m_Value(NotAB))) &&
       match(Y, m_c_And(m_Specific(A), m_Specific(B))))
     return NotAB;
 
   // ~(A & B) | (A ^ B) --> ~(A & B)
   // ~(A & B) | (B ^ A) --> ~(A & B)
-  if (match(X, m_CombineAnd(m_NotForbidUndef(m_And(m_Value(A), m_Value(B))),
+  if (match(X, m_CombineAnd(m_Not(m_And(m_Value(A), m_Value(B))),
                             m_Value(NotAB))) &&
       match(Y, m_c_Xor(m_Specific(A), m_Specific(B))))
     return NotAB;
@@ -2552,9 +2550,8 @@ static Value *simplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
   // The 'not' op must contain a complete -1 operand (no undef elements for
   // vector) for the transform to be safe.
   Value *NotA;
-  if (match(X,
-            m_c_Or(m_CombineAnd(m_NotForbidUndef(m_Value(A)), m_Value(NotA)),
-                   m_Value(B))) &&
+  if (match(X, m_c_Or(m_CombineAnd(m_Not(m_Value(A)), m_Value(NotA)),
+                      m_Value(B))) &&
       match(Y, m_c_And(m_Specific(A), m_Specific(B))))
     return NotA;
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index a5fb497f54ed15..45b359a94b3ab7 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -316,7 +316,7 @@ bool Constant::isElementWiseEqual(Value *Y) const {
   Constant *C0 = ConstantExpr::getBitCast(const_cast<Constant *>(this), IntTy);
   Constant *C1 = ConstantExpr::getBitCast(cast<Constant>(Y), IntTy);
   Constant *CmpEq = ConstantExpr::getICmp(ICmpInst::ICMP_EQ, C0, C1);
-  return isa<UndefValue>(CmpEq) || match(CmpEq, m_One());
+  return isa<PoisonValue>(CmpEq) || match(CmpEq, m_One());
 }
 
 static bool
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index d311690be64f16..0f4fbf5bbfbbdc 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2538,6 +2538,8 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
     }
   }
 
+  // and(shl(zext(X), Y), SignMask) -> and(sext(X), SignMask)
+  // where Y is a valid shift amount.
   if (match(&I, m_And(m_OneUse(m_Shl(m_ZExt(m_Value(X)), m_Value(Y))),
                       m_SignMask())) &&
       match(Y, m_SpecificInt_ICMP(
@@ -2546,15 +2548,7 @@
                    Ty->getScalarSizeInBits() -
                        X->getType()->getScalarSizeInBits())))) {
     auto *SExt = Builder.CreateSExt(X, Ty, X->getName() + ".signext");
-    auto *SanitizedSignMask = cast<Constant>(Op1);
-    // We must be careful with the undef elements of the sign bit mask, however:
-    // the mask elt can be undef iff the shift amount for that lane was undef,
-    // otherwise we need to sanitize undef masks to zero.
-    SanitizedSignMask = Constant::replaceUndefsWith(
-        SanitizedSignMask, ConstantInt::getNullValue(Ty->getScalarType()));
-    SanitizedSignMask =
-        Constant::mergeUndefsWith(SanitizedSignMask, cast<Constant>(Y));
-    return BinaryOperator::CreateAnd(SExt, SanitizedSignMask);
+    return BinaryOperator::CreateAnd(SExt, Op1);
   }
 
   if (Instruction *Z = narrowMaskedBinOp(I))
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
index 4600a6654a3622..b1e5fa4f9e1c94 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll
@@ -2032,23 +2032,23 @@ define <4 x i64> @avx2_psrlv_q_256_allbig(<4 x i64> %v) {
   ret <4 x i64> %1
 }
 
-; The shift amount is 0 (the undef lane could be 0), so we return the unshifted input.
+; The shift amount is 0 (the poison lane could be 0), so we return the unshifted input.
-define <2 x i64> @avx2_psrlv_q_128_undef(<2 x i64> %v) { -; CHECK-LABEL: @avx2_psrlv_q_128_undef( +define <2 x i64> @avx2_psrlv_q_128_poison(<2 x i64> %v) { +; CHECK-LABEL: @avx2_psrlv_q_128_poison( ; CHECK-NEXT: ret <2 x i64> [[V:%.*]] ; - %1 = insertelement <2 x i64> , i64 undef, i64 1 + %1 = insertelement <2 x i64> , i64 poison, i64 1 %2 = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> %1) ret <2 x i64> %2 } -define <4 x i64> @avx2_psrlv_q_256_undef(<4 x i64> %v) { -; CHECK-LABEL: @avx2_psrlv_q_256_undef( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i64> [[V:%.*]], +define <4 x i64> @avx2_psrlv_q_256_poison(<4 x i64> %v) { +; CHECK-LABEL: @avx2_psrlv_q_256_poison( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i64> [[V:%.*]], ; CHECK-NEXT: ret <4 x i64> [[TMP1]] ; - %1 = insertelement <4 x i64> , i64 undef, i64 0 + %1 = insertelement <4 x i64> , i64 poison, i64 0 %2 = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> %1) ret <4 x i64> %2 } @@ -2435,21 +2435,21 @@ define <4 x i64> @avx2_psllv_q_256_allbig(<4 x i64> %v) { ; The shift amount is 0 (the undef lane could be 0), so we return the unshifted input. -define <2 x i64> @avx2_psllv_q_128_undef(<2 x i64> %v) { -; CHECK-LABEL: @avx2_psllv_q_128_undef( +define <2 x i64> @avx2_psllv_q_128_poison(<2 x i64> %v) { +; CHECK-LABEL: @avx2_psllv_q_128_poison( ; CHECK-NEXT: ret <2 x i64> [[V:%.*]] ; - %1 = insertelement <2 x i64> , i64 undef, i64 1 + %1 = insertelement <2 x i64> , i64 poison, i64 1 %2 = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> %1) ret <2 x i64> %2 } -define <4 x i64> @avx2_psllv_q_256_undef(<4 x i64> %v) { -; CHECK-LABEL: @avx2_psllv_q_256_undef( -; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i64> [[V:%.*]], +define <4 x i64> @avx2_psllv_q_256_poison(<4 x i64> %v) { +; CHECK-LABEL: @avx2_psllv_q_256_poison( +; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i64> [[V:%.*]], ; CHECK-NEXT: ret <4 x i64> [[TMP1]] ; - %1 = insertelement <4 x i64> , i64 undef, i64 0 + %1 = insertelement <4 x i64> , i64 poison, i64 0 %2 = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> %1) ret <4 x i64> %2 } diff --git a/llvm/test/Transforms/InstCombine/abs-1.ll b/llvm/test/Transforms/InstCombine/abs-1.ll index 7355c560c820b2..32bd7a37053ed6 100644 --- a/llvm/test/Transforms/InstCombine/abs-1.ll +++ b/llvm/test/Transforms/InstCombine/abs-1.ll @@ -63,14 +63,14 @@ define <2 x i8> @abs_canonical_2(<2 x i8> %x) { ret <2 x i8> %abs } -; Even if a constant has undef elements. +; Even if a constant has poison elements. -define <2 x i8> @abs_canonical_2_vec_undef_elts(<2 x i8> %x) { -; CHECK-LABEL: @abs_canonical_2_vec_undef_elts( +define <2 x i8> @abs_canonical_2_vec_poison_elts(<2 x i8> %x) { +; CHECK-LABEL: @abs_canonical_2_vec_poison_elts( ; CHECK-NEXT: [[ABS:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[X:%.*]], i1 false) ; CHECK-NEXT: ret <2 x i8> [[ABS]] ; - %cmp = icmp sgt <2 x i8> %x, + %cmp = icmp sgt <2 x i8> %x, %neg = sub <2 x i8> zeroinitializer, %x %abs = select <2 x i1> %cmp, <2 x i8> %x, <2 x i8> %neg ret <2 x i8> %abs @@ -208,15 +208,15 @@ define <2 x i8> @nabs_canonical_2(<2 x i8> %x) { ret <2 x i8> %abs } -; Even if a constant has undef elements. +; Even if a constant has poison elements. 
-define <2 x i8> @nabs_canonical_2_vec_undef_elts(<2 x i8> %x) { -; CHECK-LABEL: @nabs_canonical_2_vec_undef_elts( +define <2 x i8> @nabs_canonical_2_vec_poison_elts(<2 x i8> %x) { +; CHECK-LABEL: @nabs_canonical_2_vec_poison_elts( ; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[X:%.*]], i1 false) ; CHECK-NEXT: [[ABS:%.*]] = sub <2 x i8> zeroinitializer, [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[ABS]] ; - %cmp = icmp sgt <2 x i8> %x, + %cmp = icmp sgt <2 x i8> %x, %neg = sub <2 x i8> zeroinitializer, %x %abs = select <2 x i1> %cmp, <2 x i8> %neg, <2 x i8> %x ret <2 x i8> %abs diff --git a/llvm/test/Transforms/InstCombine/add-mask-neg.ll b/llvm/test/Transforms/InstCombine/add-mask-neg.ll index 5fad6155d348e2..0e579f30976079 100644 --- a/llvm/test/Transforms/InstCombine/add-mask-neg.ll +++ b/llvm/test/Transforms/InstCombine/add-mask-neg.ll @@ -89,8 +89,8 @@ define <2 x i32> @dec_mask_neg_v2i32(<2 x i32> %X) { ret <2 x i32> %dec } -define <2 x i32> @dec_mask_neg_v2i32_undef(<2 x i32> %X) { -; CHECK-LABEL: @dec_mask_neg_v2i32_undef( +define <2 x i32> @dec_mask_neg_v2i32_poison(<2 x i32> %X) { +; CHECK-LABEL: @dec_mask_neg_v2i32_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[X]], ; CHECK-NEXT: [[DEC:%.*]] = and <2 x i32> [[TMP1]], [[TMP2]] @@ -98,7 +98,7 @@ define <2 x i32> @dec_mask_neg_v2i32_undef(<2 x i32> %X) { ; %neg = sub <2 x i32> zeroinitializer, %X %mask = and <2 x i32> %neg, %X - %dec = add <2 x i32> %mask, + %dec = add <2 x i32> %mask, ret <2 x i32> %dec } diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll index 408b0c6559b001..39b4ad80550889 100644 --- a/llvm/test/Transforms/InstCombine/add.ll +++ b/llvm/test/Transforms/InstCombine/add.ll @@ -150,24 +150,24 @@ define i32 @test5_add_nsw(i32 %A, i32 %B) { ret i32 %D } -define <2 x i8> @neg_op0_vec_undef_elt(<2 x i8> %a, <2 x i8> %b) { -; CHECK-LABEL: @neg_op0_vec_undef_elt( +define <2 x i8> @neg_op0_vec_poison_elt(<2 x i8> %a, <2 x i8> %b) { +; CHECK-LABEL: @neg_op0_vec_poison_elt( ; CHECK-NEXT: [[R:%.*]] = sub <2 x i8> [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; - %nega = sub <2 x i8> , %a + %nega = sub <2 x i8> , %a %r = add <2 x i8> %nega, %b ret <2 x i8> %r } -define <2 x i8> @neg_neg_vec_undef_elt(<2 x i8> %a, <2 x i8> %b) { -; CHECK-LABEL: @neg_neg_vec_undef_elt( +define <2 x i8> @neg_neg_vec_poison_elt(<2 x i8> %a, <2 x i8> %b) { +; CHECK-LABEL: @neg_neg_vec_poison_elt( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i8> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[R:%.*]] = sub <2 x i8> zeroinitializer, [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; - %nega = sub <2 x i8> , %a - %negb = sub <2 x i8> , %b + %nega = sub <2 x i8> , %a + %negb = sub <2 x i8> , %b %r = add <2 x i8> %nega, %negb ret <2 x i8> %r } @@ -1196,14 +1196,14 @@ define <2 x i32> @test44_vec_non_matching(<2 x i32> %A) { ret <2 x i32> %C } -define <2 x i32> @test44_vec_undef(<2 x i32> %A) { -; CHECK-LABEL: @test44_vec_undef( -; CHECK-NEXT: [[B:%.*]] = or <2 x i32> [[A:%.*]], -; CHECK-NEXT: [[C:%.*]] = add <2 x i32> [[B]], +define <2 x i32> @test44_vec_poison(<2 x i32> %A) { +; CHECK-LABEL: @test44_vec_poison( +; CHECK-NEXT: [[B:%.*]] = or <2 x i32> [[A:%.*]], +; CHECK-NEXT: [[C:%.*]] = add nsw <2 x i32> [[B]], ; CHECK-NEXT: ret <2 x i32> [[C]] ; - %B = or <2 x i32> %A, - %C = add <2 x i32> %B, + %B = or <2 x i32> %A, + %C = add <2 x i32> %B, ret <2 x i32> %C } @@ -2983,7 +2983,7 @@ define i8 @signum_i8_i8_use3(i8 %x) { ret i8 %r } -; poison/undef is 
ok to propagate in shift amount +; poison is ok to propagate in shift amount ; complexity canonicalization guarantees that shift is op0 of add define <2 x i5> @signum_v2i5_v2i5(<2 x i5> %x) { diff --git a/llvm/test/Transforms/InstCombine/and-or-icmps.ll b/llvm/test/Transforms/InstCombine/and-or-icmps.ll index 63b11d0c0bc086..c20f48a985b3ee 100644 --- a/llvm/test/Transforms/InstCombine/and-or-icmps.ll +++ b/llvm/test/Transforms/InstCombine/and-or-icmps.ll @@ -952,8 +952,8 @@ define i1 @substitute_constant_or_ne_uge_commute_logical(i8 %x, i8 %y) { ; Negative test - not safe to substitute vector constant with undef element -define <2 x i1> @substitute_constant_or_ne_slt_swap_vec(<2 x i8> %x, <2 x i8> %y) { -; CHECK-LABEL: @substitute_constant_or_ne_slt_swap_vec( +define <2 x i1> @substitute_constant_or_ne_slt_swap_vec_undef(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @substitute_constant_or_ne_slt_swap_vec_undef( ; CHECK-NEXT: [[C1:%.*]] = icmp ne <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[C2:%.*]] = icmp slt <2 x i8> [[Y:%.*]], [[X]] ; CHECK-NEXT: [[R:%.*]] = or <2 x i1> [[C1]], [[C2]] @@ -965,14 +965,29 @@ define <2 x i1> @substitute_constant_or_ne_slt_swap_vec(<2 x i8> %x, <2 x i8> %y ret <2 x i1> %r } +; TODO: The poison case would be valid to fold. + +define <2 x i1> @substitute_constant_or_ne_slt_swap_vec_poison(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @substitute_constant_or_ne_slt_swap_vec_poison( +; CHECK-NEXT: [[C1:%.*]] = icmp ne <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[C2:%.*]] = icmp slt <2 x i8> [[Y:%.*]], [[X]] +; CHECK-NEXT: [[R:%.*]] = or <2 x i1> [[C1]], [[C2]] +; CHECK-NEXT: ret <2 x i1> [[R]] +; + %c1 = icmp ne <2 x i8> %x, + %c2 = icmp slt <2 x i8> %y, %x + %r = or <2 x i1> %c1, %c2 + ret <2 x i1> %r +} + define <2 x i1> @substitute_constant_or_ne_slt_swap_vec_logical(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @substitute_constant_or_ne_slt_swap_vec_logical( -; CHECK-NEXT: [[C1:%.*]] = icmp ne <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[C1:%.*]] = icmp ne <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[C2:%.*]] = icmp slt <2 x i8> [[Y:%.*]], [[X]] ; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[C1]], <2 x i1> , <2 x i1> [[C2]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %c1 = icmp ne <2 x i8> %x, + %c1 = icmp ne <2 x i8> %x, %c2 = icmp slt <2 x i8> %y, %x %r = select <2 x i1> %c1, <2 x i1> , <2 x i1> %c2 ret <2 x i1> %r @@ -2497,29 +2512,29 @@ define <2 x i1> @icmp_eq_m1_and_eq_m1(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[TMP1]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %rx = icmp eq <2 x i8> %x, - %ry = icmp eq <2 x i8> %y, + %rx = icmp eq <2 x i8> %x, + %ry = icmp eq <2 x i8> %y, %r = and <2 x i1> %rx, %ry ret <2 x i1> %r } -define <2 x i1> @icmp_eq_m1_and_eq_undef_m1(<2 x i8> %x, <2 x i8> %y) { -; CHECK-LABEL: @icmp_eq_m1_and_eq_undef_m1( +define <2 x i1> @icmp_eq_m1_and_eq_poison_m1(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @icmp_eq_m1_and_eq_poison_m1( ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[TMP1]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %rx = icmp eq <2 x i8> %x, - %ry = icmp eq <2 x i8> %y, + %rx = icmp eq <2 x i8> %x, + %ry = icmp eq <2 x i8> %y, %r = and <2 x i1> %rx, %ry ret <2 x i1> %r } -define <2 x i1> @icmp_eq_undef_and_eq_m1_m2(<2 x i8> %x, <2 x i8> %y) { -; CHECK-LABEL: @icmp_eq_undef_and_eq_m1_m2( -; CHECK-NEXT: ret <2 x i1> zeroinitializer +define <2 x i1> @icmp_eq_poison_and_eq_m1_m2(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @icmp_eq_poison_and_eq_m1_m2( +; CHECK-NEXT: ret <2 x i1> poison ; - 
%rx = icmp eq <2 x i8> %x, + %rx = icmp eq <2 x i8> %x, %ry = icmp eq <2 x i8> %y, %r = and <2 x i1> %rx, %ry ret <2 x i1> %r @@ -2527,13 +2542,13 @@ define <2 x i1> @icmp_eq_undef_and_eq_m1_m2(<2 x i8> %x, <2 x i8> %y) { define <2 x i1> @icmp_ne_m1_and_ne_m1_fail(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @icmp_ne_m1_and_ne_m1_fail( -; CHECK-NEXT: [[RX:%.*]] = icmp ne <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[RY:%.*]] = icmp ne <2 x i8> [[Y:%.*]], +; CHECK-NEXT: [[RX:%.*]] = icmp ne <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[RY:%.*]] = icmp ne <2 x i8> [[Y:%.*]], ; CHECK-NEXT: [[R:%.*]] = and <2 x i1> [[RX]], [[RY]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %rx = icmp ne <2 x i8> %x, - %ry = icmp ne <2 x i8> %y, + %rx = icmp ne <2 x i8> %x, + %ry = icmp ne <2 x i8> %y, %r = and <2 x i1> %rx, %ry ret <2 x i1> %r } @@ -2541,13 +2556,13 @@ define <2 x i1> @icmp_ne_m1_and_ne_m1_fail(<2 x i8> %x, <2 x i8> %y) { define <2 x i1> @icmp_eq_m1_or_eq_m1_fail(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @icmp_eq_m1_or_eq_m1_fail( -; CHECK-NEXT: [[RX:%.*]] = icmp eq <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[RY:%.*]] = icmp eq <2 x i8> [[Y:%.*]], +; CHECK-NEXT: [[RX:%.*]] = icmp eq <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[RY:%.*]] = icmp eq <2 x i8> [[Y:%.*]], ; CHECK-NEXT: [[R:%.*]] = or <2 x i1> [[RX]], [[RY]] ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %rx = icmp eq <2 x i8> %x, - %ry = icmp eq <2 x i8> %y, + %rx = icmp eq <2 x i8> %x, + %ry = icmp eq <2 x i8> %y, %r = or <2 x i1> %rx, %ry ret <2 x i1> %r } @@ -2560,7 +2575,7 @@ define <2 x i1> @icmp_ne_m1_or_ne_m1(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: ret <2 x i1> [[R]] ; %rx = icmp ne <2 x i8> %x, - %ry = icmp ne <2 x i8> %y, + %ry = icmp ne <2 x i8> %y, %r = or <2 x i1> %rx, %ry ret <2 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/and-xor-or.ll b/llvm/test/Transforms/InstCombine/and-xor-or.ll index d072dc15cbb2c9..b26d6e16c2db27 100644 --- a/llvm/test/Transforms/InstCombine/and-xor-or.ll +++ b/llvm/test/Transforms/InstCombine/and-xor-or.ll @@ -843,7 +843,7 @@ define <2 x i6> @not_or_or_not_2i6(<2 x i6> %a0, <2 x i6> %b, <2 x i6> %c) { ; %a = sdiv <2 x i6> , %a0 ; thwart complexity-based canonicalization %not1 = xor <2 x i6> %b, - %not2 = xor <2 x i6> %c, + %not2 = xor <2 x i6> %c, %or1 = or <2 x i6> %a, %not1 %or2 = or <2 x i6> %or1, %not2 ret <2 x i6> %or2 @@ -4018,7 +4018,7 @@ define <2 x i4> @and_orn_xor_commute1(<2 x i4> %a, <2 x i4> %b) { ; CHECK-NEXT: ret <2 x i4> [[R]] ; %xor = xor <2 x i4> %a, %b - %nota = xor <2 x i4> %a, + %nota = xor <2 x i4> %a, %or = or <2 x i4> %nota, %b %r = and <2 x i4> %xor, %or ret <2 x i4> %r diff --git a/llvm/test/Transforms/InstCombine/and.ll b/llvm/test/Transforms/InstCombine/and.ll index ffd8c2a06c86e4..b5250fc1a7849d 100644 --- a/llvm/test/Transforms/InstCombine/and.ll +++ b/llvm/test/Transforms/InstCombine/and.ll @@ -752,16 +752,16 @@ define <2 x i64> @test36_uniform(<2 x i32> %X) { ret <2 x i64> %res } -define <2 x i64> @test36_undef(<2 x i32> %X) { -; CHECK-LABEL: @test36_undef( +define <2 x i64> @test36_poison(<2 x i32> %X) { +; CHECK-LABEL: @test36_poison( ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i32> [[X:%.*]] to <2 x i64> -; CHECK-NEXT: [[ZSUB:%.*]] = add <2 x i64> [[ZEXT]], -; CHECK-NEXT: [[RES:%.*]] = and <2 x i64> [[ZSUB]], +; CHECK-NEXT: [[ZSUB:%.*]] = add nuw nsw <2 x i64> [[ZEXT]], +; CHECK-NEXT: [[RES:%.*]] = and <2 x i64> [[ZSUB]], ; CHECK-NEXT: ret <2 x i64> [[RES]] ; %zext = zext <2 x i32> %X to <2 x i64> - %zsub = add <2 x i64> %zext, - %res = and <2 x i64> %zsub, + %zsub = add <2 x i64> %zext, + %res = and 
<2 x i64> %zsub, ret <2 x i64> %res } @@ -1630,16 +1630,16 @@ define <2 x i8> @lowmask_add_splat(<2 x i8> %x, ptr %p) { ret <2 x i8> %r } -define <2 x i8> @lowmask_add_splat_undef(<2 x i8> %x, ptr %p) { -; CHECK-LABEL: @lowmask_add_splat_undef( -; CHECK-NEXT: [[A:%.*]] = add <2 x i8> [[X:%.*]], +define <2 x i8> @lowmask_add_splat_poison(<2 x i8> %x, ptr %p) { +; CHECK-LABEL: @lowmask_add_splat_poison( +; CHECK-NEXT: [[A:%.*]] = add <2 x i8> [[X:%.*]], ; CHECK-NEXT: store <2 x i8> [[A]], ptr [[P:%.*]], align 2 -; CHECK-NEXT: [[R:%.*]] = and <2 x i8> [[A]], +; CHECK-NEXT: [[R:%.*]] = and <2 x i8> [[X]], ; CHECK-NEXT: ret <2 x i8> [[R]] ; - %a = add <2 x i8> %x, ; 0xc0 + %a = add <2 x i8> %x, ; 0xc0 store <2 x i8> %a, ptr %p - %r = and <2 x i8> %a, ; 0x20 + %r = and <2 x i8> %a, ; 0x20 ret <2 x i8> %r } @@ -1679,14 +1679,14 @@ define <2 x i8> @flip_masked_bit_uniform(<2 x i8> %A) { ret <2 x i8> %C } -define <2 x i8> @flip_masked_bit_undef(<2 x i8> %A) { -; CHECK-LABEL: @flip_masked_bit_undef( +define <2 x i8> @flip_masked_bit_poison(<2 x i8> %A) { +; CHECK-LABEL: @flip_masked_bit_poison( ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[A:%.*]], -; CHECK-NEXT: [[C:%.*]] = and <2 x i8> [[TMP1]], +; CHECK-NEXT: [[C:%.*]] = and <2 x i8> [[TMP1]], ; CHECK-NEXT: ret <2 x i8> [[C]] ; - %B = add <2 x i8> %A, - %C = and <2 x i8> %B, + %B = add <2 x i8> %A, + %C = and <2 x i8> %B, ret <2 x i8> %C } @@ -2004,7 +2004,7 @@ define i16 @invert_signbit_splat_mask_use2(i8 %x, i16 %y) { ret i16 %r } -; extra use of sext is ok +; extra use of sext is ok define i16 @invert_signbit_splat_mask_use3(i8 %x, i16 %y) { ; CHECK-LABEL: @invert_signbit_splat_mask_use3( @@ -2120,41 +2120,40 @@ define <3 x i16> @shl_lshr_pow2_const_case1_non_uniform_vec_negative(<3 x i16> % ret <3 x i16> %r } -define <3 x i16> @shl_lshr_pow2_const_case1_undef1_vec(<3 x i16> %x) { -; CHECK-LABEL: @shl_lshr_pow2_const_case1_undef1_vec( +define <3 x i16> @shl_lshr_pow2_const_case1_poison1_vec(<3 x i16> %x) { +; CHECK-LABEL: @shl_lshr_pow2_const_case1_poison1_vec( ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <3 x i16> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP1]], <3 x i16> , <3 x i16> zeroinitializer ; CHECK-NEXT: ret <3 x i16> [[R]] ; - %shl = shl <3 x i16> , %x + %shl = shl <3 x i16> , %x %lshr = lshr <3 x i16> %shl, %r = and <3 x i16> %lshr, ret <3 x i16> %r } -define <3 x i16> @shl_lshr_pow2_const_case1_undef2_vec(<3 x i16> %x) { -; CHECK-LABEL: @shl_lshr_pow2_const_case1_undef2_vec( -; CHECK-NEXT: [[SHL:%.*]] = shl <3 x i16> , [[X:%.*]] -; CHECK-NEXT: [[LSHR:%.*]] = lshr <3 x i16> [[SHL]], -; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[LSHR]], +define <3 x i16> @shl_lshr_pow2_const_case1_poison2_vec(<3 x i16> %x) { +; CHECK-LABEL: @shl_lshr_pow2_const_case1_poison2_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <3 x i16> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP1]], <3 x i16> , <3 x i16> zeroinitializer ; CHECK-NEXT: ret <3 x i16> [[R]] ; %shl = shl <3 x i16> , %x - %lshr = lshr <3 x i16> %shl, + %lshr = lshr <3 x i16> %shl, %r = and <3 x i16> %lshr, ret <3 x i16> %r } -define <3 x i16> @shl_lshr_pow2_const_case1_undef3_vec(<3 x i16> %x) { -; CHECK-LABEL: @shl_lshr_pow2_const_case1_undef3_vec( +define <3 x i16> @shl_lshr_pow2_const_case1_poison3_vec(<3 x i16> %x) { +; CHECK-LABEL: @shl_lshr_pow2_const_case1_poison3_vec( ; CHECK-NEXT: [[SHL:%.*]] = shl <3 x i16> , [[X:%.*]] ; CHECK-NEXT: [[LSHR:%.*]] = lshr <3 x i16> [[SHL]], -; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[LSHR]], +; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[LSHR]], 
; CHECK-NEXT: ret <3 x i16> [[R]] ; %shl = shl <3 x i16> , %x %lshr = lshr <3 x i16> %shl, - %r = and <3 x i16> %lshr, + %r = and <3 x i16> %lshr, ret <3 x i16> %r } @@ -2417,40 +2416,41 @@ define <3 x i16> @lshr_shl_pow2_const_case1_non_uniform_vec_negative(<3 x i16> % ret <3 x i16> %r } -define <3 x i16> @lshr_shl_pow2_const_case1_undef1_vec(<3 x i16> %x) { -; CHECK-LABEL: @lshr_shl_pow2_const_case1_undef1_vec( +define <3 x i16> @lshr_shl_pow2_const_case1_poison1_vec(<3 x i16> %x) { +; CHECK-LABEL: @lshr_shl_pow2_const_case1_poison1_vec( ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <3 x i16> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP1]], <3 x i16> , <3 x i16> zeroinitializer ; CHECK-NEXT: ret <3 x i16> [[R]] ; - %lshr = lshr <3 x i16> , %x + %lshr = lshr <3 x i16> , %x %shl = shl <3 x i16> %lshr, %r = and <3 x i16> %shl, ret <3 x i16> %r } -define <3 x i16> @lshr_shl_pow2_const_case1_undef2_vec(<3 x i16> %x) { -; CHECK-LABEL: @lshr_shl_pow2_const_case1_undef2_vec( -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <3 x i16> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP1]], <3 x i16> , <3 x i16> zeroinitializer +define <3 x i16> @lshr_shl_pow2_const_case1_poison2_vec(<3 x i16> %x) { +; CHECK-LABEL: @lshr_shl_pow2_const_case1_poison2_vec( +; CHECK-NEXT: [[LSHR:%.*]] = lshr <3 x i16> , [[X:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = shl <3 x i16> [[LSHR]], +; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[SHL]], ; CHECK-NEXT: ret <3 x i16> [[R]] ; %lshr = lshr <3 x i16> , %x - %shl = shl <3 x i16> %lshr, + %shl = shl <3 x i16> %lshr, %r = and <3 x i16> %shl, ret <3 x i16> %r } -define <3 x i16> @lshr_shl_pow2_const_case1_undef3_vec(<3 x i16> %x) { -; CHECK-LABEL: @lshr_shl_pow2_const_case1_undef3_vec( +define <3 x i16> @lshr_shl_pow2_const_case1_poison3_vec(<3 x i16> %x) { +; CHECK-LABEL: @lshr_shl_pow2_const_case1_poison3_vec( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <3 x i16> , [[X:%.*]] ; CHECK-NEXT: [[SHL:%.*]] = shl <3 x i16> [[LSHR]], -; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[SHL]], +; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[SHL]], ; CHECK-NEXT: ret <3 x i16> [[R]] ; %lshr = lshr <3 x i16> , %x %shl = shl <3 x i16> %lshr, - %r = and <3 x i16> %shl, + %r = and <3 x i16> %shl, ret <3 x i16> %r } diff --git a/llvm/test/Transforms/InstCombine/and2.ll b/llvm/test/Transforms/InstCombine/and2.ll index 73bdadc86710e7..104486e7638f56 100644 --- a/llvm/test/Transforms/InstCombine/and2.ll +++ b/llvm/test/Transforms/InstCombine/and2.ll @@ -168,14 +168,14 @@ define <2 x i8> @and1_shl1_is_cmp_eq_0_vec(<2 x i8> %x) { ret <2 x i8> %and } -define <2 x i8> @and1_shl1_is_cmp_eq_0_vec_undef(<2 x i8> %x) { -; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_vec_undef( +define <2 x i8> @and1_shl1_is_cmp_eq_0_vec_poison(<2 x i8> %x) { +; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[X:%.*]], zeroinitializer ; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8> ; CHECK-NEXT: ret <2 x i8> [[AND]] ; - %sh = shl <2 x i8> , %x - %and = and <2 x i8> %sh, + %sh = shl <2 x i8> , %x + %and = and <2 x i8> %sh, ret <2 x i8> %and } @@ -215,14 +215,13 @@ define <2 x i8> @and1_lshr1_is_cmp_eq_0_vec(<2 x i8> %x) { ret <2 x i8> %and } -define <2 x i8> @and1_lshr1_is_cmp_eq_0_vec_undef(<2 x i8> %x) { -; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0_vec_undef( -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8> +define <2 x i8> @and1_lshr1_is_cmp_eq_0_vec_poison(<2 x i8> %x) { +; CHECK-LABEL: 
@and1_lshr1_is_cmp_eq_0_vec_poison( +; CHECK-NEXT: [[AND:%.*]] = lshr <2 x i8> , [[X:%.*]] ; CHECK-NEXT: ret <2 x i8> [[AND]] ; - %sh = lshr <2 x i8> , %x - %and = and <2 x i8> %sh, + %sh = lshr <2 x i8> , %x + %and = and <2 x i8> %sh, ret <2 x i8> %and } diff --git a/llvm/test/Transforms/InstCombine/ashr-lshr.ll b/llvm/test/Transforms/InstCombine/ashr-lshr.ll index 60fa5b2597ba9c..ac206dc7999dd2 100644 --- a/llvm/test/Transforms/InstCombine/ashr-lshr.ll +++ b/llvm/test/Transforms/InstCombine/ashr-lshr.ll @@ -229,24 +229,24 @@ define <2 x i32> @ashr_lshr_inv_nonsplat_vec(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %ret } -define <2 x i32> @ashr_lshr_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @ashr_lshr_vec_undef( +define <2 x i32> @ashr_lshr_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @ashr_lshr_vec_poison( ; CHECK-NEXT: [[CMP12:%.*]] = ashr <2 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i32> [[CMP12]] ; - %cmp = icmp sgt <2 x i32> %x, + %cmp = icmp sgt <2 x i32> %x, %l = lshr <2 x i32> %x, %y %r = ashr exact <2 x i32> %x, %y %ret = select <2 x i1> %cmp, <2 x i32> %l, <2 x i32> %r ret <2 x i32> %ret } -define <2 x i32> @ashr_lshr_vec_undef2(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @ashr_lshr_vec_undef2( +define <2 x i32> @ashr_lshr_vec_poison2(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @ashr_lshr_vec_poison2( ; CHECK-NEXT: [[CMP1:%.*]] = ashr exact <2 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i32> [[CMP1]] ; - %cmp = icmp slt <2 x i32> %x, + %cmp = icmp slt <2 x i32> %x, %l = lshr exact <2 x i32> %x, %y %r = ashr exact <2 x i32> %x, %y %ret = select <2 x i1> %cmp, <2 x i32> %r, <2 x i32> %l @@ -498,14 +498,14 @@ define <3 x i42> @lshr_sub_nsw_splat(<3 x i42> %x, <3 x i42> %y) { ret <3 x i42> %shr } -define <3 x i42> @lshr_sub_nsw_splat_undef(<3 x i42> %x, <3 x i42> %y) { -; CHECK-LABEL: @lshr_sub_nsw_splat_undef( +define <3 x i42> @lshr_sub_nsw_splat_poison(<3 x i42> %x, <3 x i42> %y) { +; CHECK-LABEL: @lshr_sub_nsw_splat_poison( ; CHECK-NEXT: [[SUB:%.*]] = sub nsw <3 x i42> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[SHR:%.*]] = lshr <3 x i42> [[SUB]], +; CHECK-NEXT: [[SHR:%.*]] = lshr <3 x i42> [[SUB]], ; CHECK-NEXT: ret <3 x i42> [[SHR]] ; %sub = sub nsw <3 x i42> %x, %y - %shr = lshr <3 x i42> %sub, + %shr = lshr <3 x i42> %sub, ret <3 x i42> %shr } @@ -572,14 +572,14 @@ define <3 x i43> @ashr_sub_nsw_splat(<3 x i43> %x, <3 x i43> %y) { ret <3 x i43> %shr } -define <3 x i43> @ashr_sub_nsw_splat_undef(<3 x i43> %x, <3 x i43> %y) { -; CHECK-LABEL: @ashr_sub_nsw_splat_undef( +define <3 x i43> @ashr_sub_nsw_splat_poison(<3 x i43> %x, <3 x i43> %y) { +; CHECK-LABEL: @ashr_sub_nsw_splat_poison( ; CHECK-NEXT: [[SUB:%.*]] = sub nsw <3 x i43> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[SHR:%.*]] = ashr <3 x i43> [[SUB]], +; CHECK-NEXT: [[SHR:%.*]] = ashr <3 x i43> [[SUB]], ; CHECK-NEXT: ret <3 x i43> [[SHR]] ; %sub = sub nsw <3 x i43> %x, %y - %shr = ashr <3 x i43> %sub, + %shr = ashr <3 x i43> %sub, ret <3 x i43> %shr } diff --git a/llvm/test/Transforms/InstCombine/ashr-or-mul-abs.ll b/llvm/test/Transforms/InstCombine/ashr-or-mul-abs.ll index 3cf312e426edff..46a7f2f1189e24 100644 --- a/llvm/test/Transforms/InstCombine/ashr-or-mul-abs.ll +++ b/llvm/test/Transforms/InstCombine/ashr-or-mul-abs.ll @@ -62,13 +62,13 @@ define <4 x i32> @ashr_or_mul_to_abs_vec2(<4 x i32> %X) { ret <4 x i32> %i2 } -define <4 x i32> @ashr_or_mul_to_abs_vec3_undef(<4 x i32> %X) { -; CHECK-LABEL: @ashr_or_mul_to_abs_vec3_undef( +define <4 x i32> @ashr_or_mul_to_abs_vec3_poison(<4 x 
i32> %X) { +; CHECK-LABEL: @ashr_or_mul_to_abs_vec3_poison( ; CHECK-NEXT: [[I2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[X:%.*]], i1 false) ; CHECK-NEXT: ret <4 x i32> [[I2]] ; - %i = ashr <4 x i32> %X, - %i1 = or <4 x i32> %i, + %i = ashr <4 x i32> %X, + %i1 = or <4 x i32> %i, %i2 = mul <4 x i32> %i1, %X ret <4 x i32> %i2 } diff --git a/llvm/test/Transforms/InstCombine/binop-and-shifts.ll b/llvm/test/Transforms/InstCombine/binop-and-shifts.ll index 148963894b89fb..f776dc13bb4e5a 100644 --- a/llvm/test/Transforms/InstCombine/binop-and-shifts.ll +++ b/llvm/test/Transforms/InstCombine/binop-and-shifts.ll @@ -178,27 +178,27 @@ define <2 x i8> @shl_xor_and(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @shl_xor_and( ; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[Y:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[TMP1]], [[X:%.*]] -; CHECK-NEXT: [[BW1:%.*]] = shl <2 x i8> [[TMP2]], +; CHECK-NEXT: [[BW1:%.*]] = shl <2 x i8> [[TMP2]], ; CHECK-NEXT: ret <2 x i8> [[BW1]] ; - %shift1 = shl <2 x i8> %x, - %shift2 = shl <2 x i8> %y, - %bw2 = xor <2 x i8> %shift2, + %shift1 = shl <2 x i8> %x, + %shift2 = shl <2 x i8> %y, + %bw2 = xor <2 x i8> %shift2, %bw1 = and <2 x i8> %bw2, %shift1 ret <2 x i8> %bw1 } define <2 x i8> @shl_xor_and_fail(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @shl_xor_and_fail( -; CHECK-NEXT: [[SHIFT1:%.*]] = shl <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[SHIFT2:%.*]] = shl <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[BW2:%.*]] = xor <2 x i8> [[SHIFT2]], +; CHECK-NEXT: [[SHIFT1:%.*]] = shl <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[SHIFT2:%.*]] = shl <2 x i8> [[Y:%.*]], +; CHECK-NEXT: [[BW2:%.*]] = xor <2 x i8> [[SHIFT2]], ; CHECK-NEXT: [[BW1:%.*]] = and <2 x i8> [[SHIFT1]], [[BW2]] ; CHECK-NEXT: ret <2 x i8> [[BW1]] ; - %shift1 = shl <2 x i8> %x, - %shift2 = shl <2 x i8> %y, - %bw2 = xor <2 x i8> %shift2, + %shift1 = shl <2 x i8> %x, + %shift2 = shl <2 x i8> %y, + %bw2 = xor <2 x i8> %shift2, %bw1 = and <2 x i8> %shift1, %bw2 ret <2 x i8> %bw1 } @@ -321,13 +321,13 @@ define <2 x i8> @lshr_add_and(<2 x i8> %x, <2 x i8> %y) { define <2 x i8> @lshr_add_or_fail_dif_masks(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @lshr_add_or_fail_dif_masks( ; CHECK-NEXT: [[SHIFT1:%.*]] = lshr <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[SHIFT2:%.*]] = lshr <2 x i8> [[Y:%.*]], -; CHECK-NEXT: [[BW2:%.*]] = add <2 x i8> [[SHIFT2]], +; CHECK-NEXT: [[SHIFT2:%.*]] = lshr <2 x i8> [[Y:%.*]], +; CHECK-NEXT: [[BW2:%.*]] = add nsw <2 x i8> [[SHIFT2]], ; CHECK-NEXT: [[BW1:%.*]] = and <2 x i8> [[SHIFT1]], [[BW2]] ; CHECK-NEXT: ret <2 x i8> [[BW1]] ; %shift1 = lshr <2 x i8> %x, - %shift2 = lshr <2 x i8> %y, + %shift2 = lshr <2 x i8> %y, %bw2 = add <2 x i8> %shift2, %bw1 = and <2 x i8> %shift1, %bw2 ret <2 x i8> %bw1 @@ -659,8 +659,8 @@ define <4 x i8> @and_ashr_not_vec_commuted(<4 x i8> %x, <4 x i8> %y, <4 x i8> %s ret <4 x i8> %and } -define <4 x i8> @and_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { -; CHECK-LABEL: @and_ashr_not_vec_undef_1( +define <4 x i8> @and_ashr_not_vec_poison_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { +; CHECK-LABEL: @and_ashr_not_vec_poison_1( ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i8> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: [[AND:%.*]] = ashr <4 x i8> [[TMP2]], [[SHAMT:%.*]] @@ -668,18 +668,18 @@ define <4 x i8> @and_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %sh ; %x.shift = ashr <4 x i8> %x, %shamt %y.shift = ashr <4 x i8> %y, %shamt - %y.shift.not = xor <4 x i8> %y.shift, + %y.shift.not = xor <4 x i8> %y.shift, %and = and <4 
x i8> %x.shift, %y.shift.not ret <4 x i8> %and } -define <4 x i8> @and_ashr_not_vec_undef_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { -; CHECK-LABEL: @and_ashr_not_vec_undef_2( -; CHECK-NEXT: ret <4 x i8> zeroinitializer +define <4 x i8> @and_ashr_not_vec_poison_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { +; CHECK-LABEL: @and_ashr_not_vec_poison_2( +; CHECK-NEXT: ret <4 x i8> poison ; %x.shift = ashr <4 x i8> %x, %shamt %y.shift = ashr <4 x i8> %y, %shamt - %y.shift.not = xor <4 x i8> %y.shift, + %y.shift.not = xor <4 x i8> %y.shift, %and = and <4 x i8> %x.shift, %y.shift.not ret <4 x i8> %and } @@ -793,8 +793,8 @@ define <4 x i8> @or_ashr_not_vec_commuted(<4 x i8> %x, <4 x i8> %y, <4 x i8> %sh ret <4 x i8> %or } -define <4 x i8> @or_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { -; CHECK-LABEL: @or_ashr_not_vec_undef_1( +define <4 x i8> @or_ashr_not_vec_poison_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { +; CHECK-LABEL: @or_ashr_not_vec_poison_1( ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i8> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: [[OR:%.*]] = ashr <4 x i8> [[TMP2]], [[SHAMT:%.*]] @@ -802,18 +802,18 @@ define <4 x i8> @or_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %sha ; %x.shift = ashr <4 x i8> %x, %shamt %y.shift = ashr <4 x i8> %y, %shamt - %y.shift.not = xor <4 x i8> %y.shift, + %y.shift.not = xor <4 x i8> %y.shift, %or = or <4 x i8> %x.shift, %y.shift.not ret <4 x i8> %or } -define <4 x i8> @or_ashr_not_vec_undef_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { -; CHECK-LABEL: @or_ashr_not_vec_undef_2( -; CHECK-NEXT: ret <4 x i8> +define <4 x i8> @or_ashr_not_vec_poison_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { +; CHECK-LABEL: @or_ashr_not_vec_poison_2( +; CHECK-NEXT: ret <4 x i8> poison ; %x.shift = ashr <4 x i8> %x, %shamt %y.shift = ashr <4 x i8> %y, %shamt - %y.shift.not = xor <4 x i8> %y.shift, + %y.shift.not = xor <4 x i8> %y.shift, %or = or <4 x i8> %x.shift, %y.shift.not ret <4 x i8> %or } @@ -926,8 +926,8 @@ define <4 x i8> @xor_ashr_not_vec_commuted(<4 x i8> %x, <4 x i8> %y, <4 x i8> %s ret <4 x i8> %xor } -define <4 x i8> @xor_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { -; CHECK-LABEL: @xor_ashr_not_vec_undef_1( +define <4 x i8> @xor_ashr_not_vec_poison_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { +; CHECK-LABEL: @xor_ashr_not_vec_poison_1( ; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: [[DOTNOT:%.*]] = ashr <4 x i8> [[TMP1]], [[SHAMT:%.*]] ; CHECK-NEXT: [[XOR:%.*]] = xor <4 x i8> [[DOTNOT]], @@ -935,18 +935,18 @@ define <4 x i8> @xor_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %sh ; %x.shift = ashr <4 x i8> %x, %shamt %y.shift = ashr <4 x i8> %y, %shamt - %y.shift.not = xor <4 x i8> %y.shift, + %y.shift.not = xor <4 x i8> %y.shift, %xor = xor <4 x i8> %x.shift, %y.shift.not ret <4 x i8> %xor } -define <4 x i8> @xor_ashr_not_vec_undef_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { -; CHECK-LABEL: @xor_ashr_not_vec_undef_2( -; CHECK-NEXT: ret <4 x i8> undef +define <4 x i8> @xor_ashr_not_vec_poison_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) { +; CHECK-LABEL: @xor_ashr_not_vec_poison_2( +; CHECK-NEXT: ret <4 x i8> poison ; %x.shift = ashr <4 x i8> %x, %shamt %y.shift = ashr <4 x i8> %y, %shamt - %y.shift.not = xor <4 x i8> %y.shift, + %y.shift.not = xor <4 x i8> %y.shift, %xor = xor <4 x i8> %x.shift, %y.shift.not ret <4 x i8> %xor } diff --git a/llvm/test/Transforms/InstCombine/binop-of-displaced-shifts.ll 
b/llvm/test/Transforms/InstCombine/binop-of-displaced-shifts.ll index 27a3c8743368a1..a16ad4ddb806f6 100644 --- a/llvm/test/Transforms/InstCombine/binop-of-displaced-shifts.ll +++ b/llvm/test/Transforms/InstCombine/binop-of-displaced-shifts.ll @@ -202,41 +202,41 @@ define <2 x i8> @shl_or_non_splat(<2 x i8> %x) { ret <2 x i8> %binop } -define <2 x i8> @shl_or_undef_in_add(<2 x i8> %x) { -; CHECK-LABEL: define <2 x i8> @shl_or_undef_in_add +define <2 x i8> @shl_or_poison_in_add(<2 x i8> %x) { +; CHECK-LABEL: define <2 x i8> @shl_or_poison_in_add ; CHECK-SAME: (<2 x i8> [[X:%.*]]) { ; CHECK-NEXT: [[BINOP:%.*]] = shl <2 x i8> , [[X]] ; CHECK-NEXT: ret <2 x i8> [[BINOP]] ; %shift = shl <2 x i8> , %x - %add = add <2 x i8> %x, + %add = add <2 x i8> %x, %shift2 = shl <2 x i8> , %add %binop = or <2 x i8> %shift, %shift2 ret <2 x i8> %binop } -define <2 x i8> @shl_or_undef_in_shift1(<2 x i8> %x) { -; CHECK-LABEL: define <2 x i8> @shl_or_undef_in_shift1 +define <2 x i8> @shl_or_poison_in_shift1(<2 x i8> %x) { +; CHECK-LABEL: define <2 x i8> @shl_or_poison_in_shift1 ; CHECK-SAME: (<2 x i8> [[X:%.*]]) { -; CHECK-NEXT: [[BINOP:%.*]] = shl <2 x i8> , [[X]] +; CHECK-NEXT: [[BINOP:%.*]] = shl <2 x i8> , [[X]] ; CHECK-NEXT: ret <2 x i8> [[BINOP]] ; - %shift = shl <2 x i8> , %x + %shift = shl <2 x i8> , %x %add = add <2 x i8> %x, %shift2 = shl <2 x i8> , %add %binop = or <2 x i8> %shift, %shift2 ret <2 x i8> %binop } -define <2 x i8> @shl_or_undef_in_shift2(<2 x i8> %x) { -; CHECK-LABEL: define <2 x i8> @shl_or_undef_in_shift2 +define <2 x i8> @shl_or_poison_in_shift2(<2 x i8> %x) { +; CHECK-LABEL: define <2 x i8> @shl_or_poison_in_shift2 ; CHECK-SAME: (<2 x i8> [[X:%.*]]) { -; CHECK-NEXT: [[BINOP:%.*]] = shl <2 x i8> , [[X]] +; CHECK-NEXT: [[BINOP:%.*]] = shl <2 x i8> , [[X]] ; CHECK-NEXT: ret <2 x i8> [[BINOP]] ; %shift = shl <2 x i8> , %x %add = add <2 x i8> %x, - %shift2 = shl <2 x i8> , %add + %shift2 = shl <2 x i8> , %add %binop = or <2 x i8> %shift, %shift2 ret <2 x i8> %binop } diff --git a/llvm/test/Transforms/InstCombine/canonicalize-clamp-like-pattern-between-zero-and-positive-threshold.ll b/llvm/test/Transforms/InstCombine/canonicalize-clamp-like-pattern-between-zero-and-positive-threshold.ll index 4547008b760933..c555970ea43484 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-clamp-like-pattern-between-zero-and-positive-threshold.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-clamp-like-pattern-between-zero-and-positive-threshold.ll @@ -338,22 +338,22 @@ define <2 x i32> @t18_ult_slt_vec_nonsplat(<2 x i32> %x, <2 x i32> %replacement_ ret <2 x i32> %r } -define <3 x i32> @t19_ult_slt_vec_undef0(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) { -; CHECK-LABEL: @t19_ult_slt_vec_undef0( +define <3 x i32> @t19_ult_slt_vec_poison0(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) { +; CHECK-LABEL: @t19_ult_slt_vec_poison0( ; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <3 x i32> [[X:%.*]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <3 x i32> [[X]], ; CHECK-NEXT: [[TMP3:%.*]] = select <3 x i1> [[TMP1]], <3 x i32> [[REPLACEMENT_LOW:%.*]], <3 x i32> [[X]] ; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP2]], <3 x i32> [[REPLACEMENT_HIGH:%.*]], <3 x i32> [[TMP3]] ; CHECK-NEXT: ret <3 x i32> [[R]] ; - %t0 = icmp slt <3 x i32> %x, + %t0 = icmp slt <3 x i32> %x, %t1 = select <3 x i1> %t0, <3 x i32> %replacement_low, <3 x i32> %replacement_high %t2 = icmp ult <3 x i32> %x, %r = select <3 x i1> %t2, <3 x i32> %x, <3 x i32> %t1 ret <3 x i32> %r } 
-define <3 x i32> @t20_ult_slt_vec_undef1(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) { -; CHECK-LABEL: @t20_ult_slt_vec_undef1( +define <3 x i32> @t20_ult_slt_vec_poison1(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) { +; CHECK-LABEL: @t20_ult_slt_vec_poison1( ; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <3 x i32> [[X:%.*]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <3 x i32> [[X]], ; CHECK-NEXT: [[TMP3:%.*]] = select <3 x i1> [[TMP1]], <3 x i32> [[REPLACEMENT_LOW:%.*]], <3 x i32> [[X]] @@ -362,21 +362,21 @@ define <3 x i32> @t20_ult_slt_vec_undef1(<3 x i32> %x, <3 x i32> %replacement_lo ; %t0 = icmp slt <3 x i32> %x, %t1 = select <3 x i1> %t0, <3 x i32> %replacement_low, <3 x i32> %replacement_high - %t2 = icmp ult <3 x i32> %x, + %t2 = icmp ult <3 x i32> %x, %r = select <3 x i1> %t2, <3 x i32> %x, <3 x i32> %t1 ret <3 x i32> %r } -define <3 x i32> @t21_ult_slt_vec_undef2(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) { -; CHECK-LABEL: @t21_ult_slt_vec_undef2( +define <3 x i32> @t21_ult_slt_vec_poison2(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) { +; CHECK-LABEL: @t21_ult_slt_vec_poison2( ; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <3 x i32> [[X:%.*]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <3 x i32> [[X]], ; CHECK-NEXT: [[TMP3:%.*]] = select <3 x i1> [[TMP1]], <3 x i32> [[REPLACEMENT_LOW:%.*]], <3 x i32> [[X]] ; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP2]], <3 x i32> [[REPLACEMENT_HIGH:%.*]], <3 x i32> [[TMP3]] ; CHECK-NEXT: ret <3 x i32> [[R]] ; - %t0 = icmp slt <3 x i32> %x, + %t0 = icmp slt <3 x i32> %x, %t1 = select <3 x i1> %t0, <3 x i32> %replacement_low, <3 x i32> %replacement_high - %t2 = icmp ult <3 x i32> %x, + %t2 = icmp ult <3 x i32> %x, %r = select <3 x i1> %t2, <3 x i32> %x, <3 x i32> %t1 ret <3 x i32> %r } diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll index 5b7a99d53c308c..759770688cf209 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll @@ -79,12 +79,12 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1(<2 x i8> %x) { ret <2 x i1> %ret } -define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) { -; CHECK-LABEL: @p3_vec_splat_undef( +define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) { +; CHECK-LABEL: @p3_vec_splat_poison( ; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[X:%.*]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp eq <3 x i8> %tmp0, %x ret <3 x i1> %ret } diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll index 160d968b9ac4c7..95e6d5ac6a5f81 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll @@ -79,22 +79,22 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1(<2 x i8> %x) { ret <2 x i1> %ret } -define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) { -; CHECK-LABEL: @p3_vec_splat_undef( +define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) { +; CHECK-LABEL: @p3_vec_splat_poison( ; CHECK-NEXT: 
[[RET:%.*]] = icmp ugt <3 x i8> [[X:%.*]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp ne <3 x i8> %tmp0, %x ret <3 x i1> %ret } -define <3 x i1> @p3_vec_nonsplat_undef(<3 x i8> %x) { -; CHECK-LABEL: @p3_vec_nonsplat_undef( +define <3 x i1> @p3_vec_nonsplat_poison(<3 x i8> %x) { +; CHECK-LABEL: @p3_vec_nonsplat_poison( ; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X:%.*]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp ne <3 x i8> %tmp0, %x ret <3 x i1> %ret } diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll index 60921042d52435..ae503bfb1cfe28 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll @@ -58,12 +58,12 @@ define <2 x i1> @p2_vec_nonsplat_edgecase(<2 x i8> %x) { ret <2 x i1> %ret } -define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) { -; CHECK-LABEL: @p3_vec_splat_undef( +define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) { +; CHECK-LABEL: @p3_vec_splat_poison( ; CHECK-NEXT: [[RET:%.*]] = icmp slt <3 x i8> [[X:%.*]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp sge <3 x i8> %tmp0, %x ret <3 x i1> %ret } @@ -175,11 +175,11 @@ define <2 x i1> @n3_vec(<2 x i8> %x) { define <3 x i1> @n4_vec(<3 x i8> %x) { ; CHECK-LABEL: @n4_vec( -; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X:%.*]], +; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X:%.*]], ; CHECK-NEXT: [[RET:%.*]] = icmp sge <3 x i8> [[TMP0]], [[X]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp sge <3 x i8> %tmp0, %x ret <3 x i1> %ret } diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll index 6345e70d7220e2..f1333fed2c5179 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll @@ -72,26 +72,26 @@ define <2 x i1> @p2_vec_nonsplat_edgecase() { ret <2 x i1> %ret } -define <3 x i1> @p3_vec_splat_undef() { -; CHECK-LABEL: @p3_vec_splat_undef( +define <3 x i1> @p3_vec_splat_poison() { +; CHECK-LABEL: @p3_vec_splat_poison( ; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8() ; CHECK-NEXT: [[RET:%.*]] = icmp sgt <3 x i8> [[X]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %x = call <3 x i8> @gen3x8() - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp sgt <3 x i8> %x, %tmp0 ret <3 x i1> %ret } -define <3 x i1> @p3_vec_nonsplat_undef() { -; CHECK-LABEL: @p3_vec_nonsplat_undef( +define <3 x i1> @p3_vec_nonsplat_poison() { +; CHECK-LABEL: @p3_vec_nonsplat_poison( ; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8() ; CHECK-NEXT: [[RET:%.*]] = icmp sgt <3 x i8> [[X]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %x = call <3 x i8> @gen3x8() - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp sgt <3 x i8> %x, %tmp0 ret <3 x i1> %ret } @@ -212,12 +212,12 @@ define <2 x i1> @n3_vec() { define <3 x i1> @n4_vec() { ; CHECK-LABEL: @n4_vec( ; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8() -; CHECK-NEXT: 
[[TMP0:%.*]] = and <3 x i8> [[X]], +; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X]], ; CHECK-NEXT: [[RET:%.*]] = icmp sgt <3 x i8> [[X]], [[TMP0]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %x = call <3 x i8> @gen3x8() - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp sgt <3 x i8> %x, %tmp0 ret <3 x i1> %ret } diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll index b7aec53fed6760..4bed21a525f052 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll @@ -72,14 +72,14 @@ define <2 x i1> @p2_vec_nonsplat_edgecase() { ret <2 x i1> %ret } -define <3 x i1> @p3_vec_splat_undef() { -; CHECK-LABEL: @p3_vec_splat_undef( +define <3 x i1> @p3_vec_splat_poison() { +; CHECK-LABEL: @p3_vec_splat_poison( ; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8() ; CHECK-NEXT: [[RET:%.*]] = icmp slt <3 x i8> [[X]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %x = call <3 x i8> @gen3x8() - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp sle <3 x i8> %x, %tmp0 ret <3 x i1> %ret } @@ -200,12 +200,12 @@ define <2 x i1> @n3_vec() { define <3 x i1> @n4_vec() { ; CHECK-LABEL: @n4_vec( ; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8() -; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X]], +; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X]], ; CHECK-NEXT: [[RET:%.*]] = icmp sle <3 x i8> [[X]], [[TMP0]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %x = call <3 x i8> @gen3x8() - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp sle <3 x i8> %x, %tmp0 ret <3 x i1> %ret } diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll index 56661d335c4f60..be6e3d0306bcdf 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll @@ -58,22 +58,22 @@ define <2 x i1> @p2_vec_nonsplat_edgecase(<2 x i8> %x) { ret <2 x i1> %ret } -define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) { -; CHECK-LABEL: @p3_vec_splat_undef( +define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) { +; CHECK-LABEL: @p3_vec_splat_poison( ; CHECK-NEXT: [[RET:%.*]] = icmp sgt <3 x i8> [[X:%.*]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp slt <3 x i8> %tmp0, %x ret <3 x i1> %ret } -define <3 x i1> @p3_vec_nonsplat_undef(<3 x i8> %x) { -; CHECK-LABEL: @p3_vec_nonsplat_undef( +define <3 x i1> @p3_vec_nonsplat_poison(<3 x i8> %x) { +; CHECK-LABEL: @p3_vec_nonsplat_poison( ; CHECK-NEXT: [[RET:%.*]] = icmp sgt <3 x i8> [[X:%.*]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp slt <3 x i8> %tmp0, %x ret <3 x i1> %ret } @@ -185,11 +185,11 @@ define <2 x i1> @n3(<2 x i8> %x) { define <3 x i1> @n4(<3 x i8> %x) { ; CHECK-LABEL: @n4( -; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X:%.*]], +; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X:%.*]], ; CHECK-NEXT: [[RET:%.*]] = icmp slt <3 x i8> [[TMP0]], [[X]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp slt <3 x i8> %tmp0, %x ret <3 x i1> %ret } diff 
--git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll index a93e8f779435fc..cfd48821b2c1d5 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll @@ -79,12 +79,12 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1(<2 x i8> %x) { ret <2 x i1> %ret } -define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) { -; CHECK-LABEL: @p3_vec_splat_undef( +define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) { +; CHECK-LABEL: @p3_vec_splat_poison( ; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[X:%.*]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp uge <3 x i8> %tmp0, %x ret <3 x i1> %ret } diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ugt-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ugt-to-icmp-ugt.ll index 73ea4d456d2462..6f6ba95a81c767 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ugt-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ugt-to-icmp-ugt.ll @@ -95,26 +95,26 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1() { ret <2 x i1> %ret } -define <3 x i1> @p3_vec_splat_undef() { -; CHECK-LABEL: @p3_vec_splat_undef( +define <3 x i1> @p3_vec_splat_poison() { +; CHECK-LABEL: @p3_vec_splat_poison( ; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8() ; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %x = call <3 x i8> @gen3x8() - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp ugt <3 x i8> %x, %tmp0 ret <3 x i1> %ret } -define <3 x i1> @p3_vec_nonsplat_undef() { -; CHECK-LABEL: @p3_vec_nonsplat_undef( +define <3 x i1> @p3_vec_nonsplat_poison() { +; CHECK-LABEL: @p3_vec_nonsplat_poison( ; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8() ; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %x = call <3 x i8> @gen3x8() - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp ugt <3 x i8> %x, %tmp0 ret <3 x i1> %ret } diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ule-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ule-to-icmp-ule.ll index 53886b5f2dc9c3..54f00321c4cf02 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ule-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ule-to-icmp-ule.ll @@ -95,14 +95,14 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1() { ret <2 x i1> %ret } -define <3 x i1> @p3_vec_splat_undef() { -; CHECK-LABEL: @p3_vec_splat_undef( +define <3 x i1> @p3_vec_splat_poison() { +; CHECK-LABEL: @p3_vec_splat_poison( ; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8() ; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[X]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %x = call <3 x i8> @gen3x8() - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp ule <3 x i8> %x, %tmp0 ret <3 x i1> %ret } diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll index 
d66be571008c2f..008fc6d2d6eda9 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll @@ -80,22 +80,22 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1(<2 x i8> %x) { ret <2 x i1> %ret } -define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) { -; CHECK-LABEL: @p3_vec_splat_undef( +define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) { +; CHECK-LABEL: @p3_vec_splat_poison( ; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X:%.*]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp ult <3 x i8> %tmp0, %x ret <3 x i1> %ret } -define <3 x i1> @p3_vec_nonsplat_undef(<3 x i8> %x) { -; CHECK-LABEL: @p3_vec_nonsplat_undef( +define <3 x i1> @p3_vec_nonsplat_poison(<3 x i8> %x) { +; CHECK-LABEL: @p3_vec_nonsplat_poison( ; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X:%.*]], ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = and <3 x i8> %x, + %tmp0 = and <3 x i8> %x, %ret = icmp ult <3 x i8> %tmp0, %x ret <3 x i1> %ret } diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll index 38611d8b53a988..dc5658d302d991 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll @@ -40,13 +40,13 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i1> %ret } -define <3 x i1> @p2_vec_undef(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @p2_vec_undef( -; CHECK-NEXT: [[TMP0:%.*]] = lshr <3 x i8> , [[Y:%.*]] +define <3 x i1> @p2_vec_poison(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @p2_vec_poison( +; CHECK-NEXT: [[TMP0:%.*]] = lshr <3 x i8> , [[Y:%.*]] ; CHECK-NEXT: [[RET:%.*]] = icmp uge <3 x i8> [[TMP0]], [[X:%.*]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = lshr <3 x i8> , %y + %tmp0 = lshr <3 x i8> , %y %tmp1 = and <3 x i8> %tmp0, %x %ret = icmp eq <3 x i8> %tmp1, %x ret <3 x i1> %ret diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll index 37d317b695f60c..8fbbd2bb9907d9 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll @@ -40,13 +40,13 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i1> %ret } -define <3 x i1> @p2_vec_undef(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @p2_vec_undef( -; CHECK-NEXT: [[TMP0:%.*]] = lshr <3 x i8> , [[Y:%.*]] +define <3 x i1> @p2_vec_poison(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @p2_vec_poison( +; CHECK-NEXT: [[TMP0:%.*]] = lshr <3 x i8> , [[Y:%.*]] ; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[TMP0]], [[X:%.*]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %tmp0 = lshr <3 x i8> , %y + %tmp0 = lshr <3 x i8> , %y %tmp1 = and <3 x i8> %tmp0, %x %ret = icmp ne <3 x i8> %tmp1, %x ret <3 x i1> %ret diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll index dfd67eae8aafd4..88487b38e2c708 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll +++ 
b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll @@ -44,40 +44,40 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i1> %ret } -define <3 x i1> @p2_vec_undef0(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @p2_vec_undef0( +define <3 x i1> @p2_vec_poison0(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @p2_vec_poison0( ; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq <3 x i8> [[X_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %t0 = shl <3 x i8> , %y + %t0 = shl <3 x i8> , %y %t1 = xor <3 x i8> %t0, %t2 = and <3 x i8> %t1, %x %ret = icmp eq <3 x i8> %t2, %x ret <3 x i1> %ret } -define <3 x i1> @p3_vec_undef0(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @p3_vec_undef0( +define <3 x i1> @p3_vec_poison0(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @p3_vec_poison0( ; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq <3 x i8> [[X_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %t0 = shl <3 x i8> , %y - %t1 = xor <3 x i8> %t0, + %t1 = xor <3 x i8> %t0, %t2 = and <3 x i8> %t1, %x %ret = icmp eq <3 x i8> %t2, %x ret <3 x i1> %ret } -define <3 x i1> @p4_vec_undef2(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @p4_vec_undef2( +define <3 x i1> @p4_vec_poison2(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @p4_vec_poison2( ; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[RET:%.*]] = icmp eq <3 x i8> [[X_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %t0 = shl <3 x i8> , %y - %t1 = xor <3 x i8> %t0, + %t0 = shl <3 x i8> , %y + %t1 = xor <3 x i8> %t0, %t2 = and <3 x i8> %t1, %x %ret = icmp eq <3 x i8> %t2, %x ret <3 x i1> %ret diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll index 608e133ec7f73c..b717925fd644fc 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll @@ -44,40 +44,40 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i1> %ret } -define <3 x i1> @p2_vec_undef0(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @p2_vec_undef0( +define <3 x i1> @p2_vec_poison0(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @p2_vec_poison0( ; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne <3 x i8> [[X_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %t0 = shl <3 x i8> , %y + %t0 = shl <3 x i8> , %y %t1 = xor <3 x i8> %t0, %t2 = and <3 x i8> %t1, %x %ret = icmp ne <3 x i8> %t2, %x ret <3 x i1> %ret } -define <3 x i1> @p3_vec_undef0(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @p3_vec_undef0( +define <3 x i1> @p3_vec_poison0(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @p3_vec_poison0( ; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne <3 x i8> [[X_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[RET]] ; %t0 = shl <3 x i8> , %y - %t1 = xor <3 x i8> %t0, + %t1 = xor <3 x i8> %t0, %t2 = and <3 x i8> %t1, %x %ret = icmp ne <3 x i8> %t2, %x ret <3 x i1> %ret } -define <3 x i1> @p4_vec_undef2(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @p4_vec_undef2( +define <3 x i1> @p4_vec_poison2(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @p4_vec_poison2( ; CHECK-NEXT: 
[[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[RET:%.*]] = icmp ne <3 x i8> [[X_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %t0 = shl <3 x i8> , %y - %t1 = xor <3 x i8> %t0, + %t0 = shl <3 x i8> , %y + %t1 = xor <3 x i8> %t0, %t2 = and <3 x i8> %t1, %x %ret = icmp ne <3 x i8> %t2, %x ret <3 x i1> %ret diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll index d13129c1248a49..f48d284e085bcd 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll @@ -54,15 +54,15 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i1> %ret } -define <3 x i1> @p2_vec_undef0(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @p2_vec_undef0( -; CHECK-NEXT: [[T0:%.*]] = shl <3 x i8> , [[Y:%.*]] +define <3 x i1> @p2_vec_poison0(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @p2_vec_poison0( +; CHECK-NEXT: [[T0:%.*]] = shl nsw <3 x i8> , [[Y:%.*]] ; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i8> , [[Y]] ; CHECK-NEXT: [[RET:%.*]] = icmp uge <3 x i8> [[T1]], [[X:%.*]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %t0 = shl <3 x i8> , %y + %t0 = shl <3 x i8> , %y call void @use3i8(<3 x i8> %t0) %t1 = lshr <3 x i8> %t0, %y %t2 = and <3 x i8> %t1, %x diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll index a1517b36d0b9d9..f4b3c67164e492 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll @@ -54,15 +54,15 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i1> %ret } -define <3 x i1> @p2_vec_undef0(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @p2_vec_undef0( -; CHECK-NEXT: [[T0:%.*]] = shl <3 x i8> , [[Y:%.*]] +define <3 x i1> @p2_vec_poison0(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @p2_vec_poison0( +; CHECK-NEXT: [[T0:%.*]] = shl nsw <3 x i8> , [[Y:%.*]] ; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i8> , [[Y]] ; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[T1]], [[X:%.*]] ; CHECK-NEXT: ret <3 x i1> [[RET]] ; - %t0 = shl <3 x i8> , %y + %t0 = shl <3 x i8> , %y call void @use3i8(<3 x i8> %t0) %t1 = lshr <3 x i8> %t0, %y %t2 = and <3 x i8> %t1, %x diff --git a/llvm/test/Transforms/InstCombine/cast-int-icmp-eq-0.ll b/llvm/test/Transforms/InstCombine/cast-int-icmp-eq-0.ll index 9b51a7649992f3..7b6d07a14a30e6 100644 --- a/llvm/test/Transforms/InstCombine/cast-int-icmp-eq-0.ll +++ b/llvm/test/Transforms/InstCombine/cast-int-icmp-eq-0.ll @@ -603,7 +603,7 @@ define i1 @i16_cast_cmp_sgt_int_m1_sitofp_half(i16 %i) { ret i1 %cmp } -; Verify that vector types and vector constants including undef elements are transformed too. +; Verify that vector types and vector constants including poison elements are transformed too. define <3 x i1> @i32_cast_cmp_ne_int_0_sitofp_double_vec(<3 x i32> %i) { ; CHECK-LABEL: @i32_cast_cmp_ne_int_0_sitofp_double_vec( @@ -616,38 +616,38 @@ define <3 x i1> @i32_cast_cmp_ne_int_0_sitofp_double_vec(<3 x i32> %i) { ret <3 x i1> %cmp } -; TODO: Can we propagate the constant vector with undef element? 
+; TODO: Can we propagate the constant vector with poison element? -define <3 x i1> @i32_cast_cmp_eq_int_0_sitofp_float_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_eq_int_0_sitofp_float_vec_undef( +define <3 x i1> @i32_cast_cmp_eq_int_0_sitofp_float_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_eq_int_0_sitofp_float_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = sitofp <3 x i32> %i to <3 x float> %b = bitcast <3 x float> %f to <3 x i32> - %cmp = icmp eq <3 x i32> %b, + %cmp = icmp eq <3 x i32> %b, ret <3 x i1> %cmp } -define <3 x i1> @i64_cast_cmp_slt_int_1_sitofp_half_vec_undef(<3 x i64> %i) { -; CHECK-LABEL: @i64_cast_cmp_slt_int_1_sitofp_half_vec_undef( +define <3 x i1> @i64_cast_cmp_slt_int_1_sitofp_half_vec_poison(<3 x i64> %i) { +; CHECK-LABEL: @i64_cast_cmp_slt_int_1_sitofp_half_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp slt <3 x i64> [[I:%.*]], ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = sitofp <3 x i64> %i to <3 x half> %b = bitcast <3 x half> %f to <3 x i16> - %cmp = icmp slt <3 x i16> %b, + %cmp = icmp slt <3 x i16> %b, ret <3 x i1> %cmp } -define <3 x i1> @i16_cast_cmp_sgt_int_m1_sitofp_float_vec_undef(<3 x i16> %i) { -; CHECK-LABEL: @i16_cast_cmp_sgt_int_m1_sitofp_float_vec_undef( +define <3 x i1> @i16_cast_cmp_sgt_int_m1_sitofp_float_vec_poison(<3 x i16> %i) { +; CHECK-LABEL: @i16_cast_cmp_sgt_int_m1_sitofp_float_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <3 x i16> [[I:%.*]], ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = sitofp <3 x i16> %i to <3 x float> %b = bitcast <3 x float> %f to <3 x i32> - %cmp = icmp sgt <3 x i32> %b, + %cmp = icmp sgt <3 x i32> %b, ret <3 x i1> %cmp } diff --git a/llvm/test/Transforms/InstCombine/cast-unsigned-icmp-eqcmp-0.ll b/llvm/test/Transforms/InstCombine/cast-unsigned-icmp-eqcmp-0.ll index 0752576fad45f2..1565fb7c0a6a9f 100644 --- a/llvm/test/Transforms/InstCombine/cast-unsigned-icmp-eqcmp-0.ll +++ b/llvm/test/Transforms/InstCombine/cast-unsigned-icmp-eqcmp-0.ll @@ -27,14 +27,14 @@ define <2 x i1> @i32_cast_cmp_eq_int_0_uitofp_float_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> @i32_cast_cmp_eq_int_0_uitofp_float_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_float_vec_undef( +define <3 x i1> @i32_cast_cmp_eq_int_0_uitofp_float_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_float_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x float> %b = bitcast <3 x float> %f to <3 x i32> - %cmp = icmp eq <3 x i32> %b, + %cmp = icmp eq <3 x i32> %b, ret <3 x i1> %cmp } @@ -60,14 +60,14 @@ define <2 x i1> @i32_cast_cmp_ne_int_0_uitofp_float_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_float_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_ne_int_0_uitofp_float_vec_undef( +define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_float_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_ne_int_0_uitofp_float_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x float> %b = bitcast <3 x float> %f to <3 x i32> - %cmp = icmp ne <3 x i32> %b, + %cmp = icmp ne <3 x i32> %b, ret <3 x i1> %cmp } @@ -93,14 +93,14 @@ define <2 x i1> @i32_cast_cmp_eq_int_0_uitofp_double_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> 
@i32_cast_cmp_eq_int_0_uitofp_double_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_double_vec_undef( +define <3 x i1> @i32_cast_cmp_eq_int_0_uitofp_double_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_double_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x double> %b = bitcast <3 x double> %f to <3 x i64> - %cmp = icmp eq <3 x i64> %b, + %cmp = icmp eq <3 x i64> %b, ret <3 x i1> %cmp } @@ -126,14 +126,14 @@ define <2 x i1> @i32_cast_cmp_ne_int_0_uitofp_double_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_double_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_ne_int_0_uitofp_double_vec_undef( +define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_double_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_ne_int_0_uitofp_double_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x double> %b = bitcast <3 x double> %f to <3 x i64> - %cmp = icmp ne <3 x i64> %b, + %cmp = icmp ne <3 x i64> %b, ret <3 x i1> %cmp } @@ -159,14 +159,14 @@ define <2 x i1> @i32_cast_cmp_eq_int_0_uitofp_half_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> @i32_cast_cmp_eq_int_0_uitofp_half_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_half_vec_undef( +define <3 x i1> @i32_cast_cmp_eq_int_0_uitofp_half_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_half_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x half> %b = bitcast <3 x half> %f to <3 x i16> - %cmp = icmp eq <3 x i16> %b, + %cmp = icmp eq <3 x i16> %b, ret <3 x i1> %cmp } @@ -192,13 +192,13 @@ define <2 x i1> @i32_cast_cmp_ne_int_0_uitofp_half_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_half_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_ne_int_0_uitofp_half_vec_undef( +define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_half_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_ne_int_0_uitofp_half_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x half> %b = bitcast <3 x half> %f to <3 x i16> - %cmp = icmp ne <3 x i16> %b, + %cmp = icmp ne <3 x i16> %b, ret <3 x i1> %cmp } diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll index d9c93ba277295c..04a3e8931e62c9 100644 --- a/llvm/test/Transforms/InstCombine/cast.ll +++ b/llvm/test/Transforms/InstCombine/cast.ll @@ -508,18 +508,16 @@ define <2 x i16> @test40vec_nonuniform(<2 x i16> %a) { ret <2 x i16> %r } -define <2 x i16> @test40vec_undef(<2 x i16> %a) { -; ALL-LABEL: @test40vec_undef( -; ALL-NEXT: [[T:%.*]] = zext <2 x i16> [[A:%.*]] to <2 x i32> -; ALL-NEXT: [[T21:%.*]] = lshr <2 x i32> [[T]], -; ALL-NEXT: [[T5:%.*]] = shl <2 x i32> [[T]], -; ALL-NEXT: [[T32:%.*]] = or <2 x i32> [[T21]], [[T5]] -; ALL-NEXT: [[R:%.*]] = trunc <2 x i32> [[T32]] to <2 x i16> +define <2 x i16> @test40vec_poison(<2 x i16> %a) { +; ALL-LABEL: @test40vec_poison( +; ALL-NEXT: [[T21:%.*]] = lshr <2 x i16> [[A:%.*]], +; ALL-NEXT: [[T5:%.*]] = shl <2 x i16> [[A]], +; ALL-NEXT: [[R:%.*]] = or disjoint <2 x i16> [[T21]], [[T5]] ; ALL-NEXT: ret <2 x i16> [[R]] ; %t = zext <2 x i16> 
%a to <2 x i32> - %t21 = lshr <2 x i32> %t, - %t5 = shl <2 x i32> %t, + %t21 = lshr <2 x i32> %t, + %t5 = shl <2 x i32> %t, %t32 = or <2 x i32> %t21, %t5 %r = trunc <2 x i32> %t32 to <2 x i16> ret <2 x i16> %r @@ -1452,7 +1450,7 @@ define i32 @test89() { ; LE-LABEL: @test89( ; LE-NEXT: ret i32 6 ; - ret i32 bitcast (<2 x i16> to i32) + ret i32 bitcast (<2 x i16> to i32) } define <2 x i32> @test90() { @@ -1462,7 +1460,7 @@ define <2 x i32> @test90() { ; LE-LABEL: @test90( ; LE-NEXT: ret <2 x i32> ; - %t6 = bitcast <4 x half> to <2 x i32> + %t6 = bitcast <4 x half> to <2 x i32> ret <2 x i32> %t6 } @@ -1537,13 +1535,13 @@ define <2 x i8> @trunc_lshr_sext_uniform(<2 x i8> %A) { ret <2 x i8> %D } -define <2 x i8> @trunc_lshr_sext_uniform_undef(<2 x i8> %A) { -; ALL-LABEL: @trunc_lshr_sext_uniform_undef( -; ALL-NEXT: [[D:%.*]] = ashr <2 x i8> [[A:%.*]], +define <2 x i8> @trunc_lshr_sext_uniform_poison(<2 x i8> %A) { +; ALL-LABEL: @trunc_lshr_sext_uniform_poison( +; ALL-NEXT: [[D:%.*]] = ashr <2 x i8> [[A:%.*]], ; ALL-NEXT: ret <2 x i8> [[D]] ; %B = sext <2 x i8> %A to <2 x i32> - %C = lshr <2 x i32> %B, + %C = lshr <2 x i32> %B, %D = trunc <2 x i32> %C to <2 x i8> ret <2 x i8> %D } @@ -1559,13 +1557,13 @@ define <2 x i8> @trunc_lshr_sext_nonuniform(<2 x i8> %A) { ret <2 x i8> %D } -define <3 x i8> @trunc_lshr_sext_nonuniform_undef(<3 x i8> %A) { -; ALL-LABEL: @trunc_lshr_sext_nonuniform_undef( -; ALL-NEXT: [[D:%.*]] = ashr <3 x i8> [[A:%.*]], +define <3 x i8> @trunc_lshr_sext_nonuniform_poison(<3 x i8> %A) { +; ALL-LABEL: @trunc_lshr_sext_nonuniform_poison( +; ALL-NEXT: [[D:%.*]] = ashr <3 x i8> [[A:%.*]], ; ALL-NEXT: ret <3 x i8> [[D]] ; %B = sext <3 x i8> %A to <3 x i32> - %C = lshr <3 x i32> %B, + %C = lshr <3 x i32> %B, %D = trunc <3 x i32> %C to <3 x i8> ret <3 x i8> %D } @@ -2014,15 +2012,13 @@ define <2 x i8> @trunc_lshr_zext_uniform(<2 x i8> %A) { ret <2 x i8> %D } -define <2 x i8> @trunc_lshr_zext_uniform_undef(<2 x i8> %A) { -; ALL-LABEL: @trunc_lshr_zext_uniform_undef( -; ALL-NEXT: [[B:%.*]] = zext <2 x i8> [[A:%.*]] to <2 x i32> -; ALL-NEXT: [[C:%.*]] = lshr <2 x i32> [[B]], -; ALL-NEXT: [[D:%.*]] = trunc nuw <2 x i32> [[C]] to <2 x i8> +define <2 x i8> @trunc_lshr_zext_uniform_poison(<2 x i8> %A) { +; ALL-LABEL: @trunc_lshr_zext_uniform_poison( +; ALL-NEXT: [[D:%.*]] = lshr <2 x i8> [[A:%.*]], ; ALL-NEXT: ret <2 x i8> [[D]] ; %B = zext <2 x i8> %A to <2 x i32> - %C = lshr <2 x i32> %B, + %C = lshr <2 x i32> %B, %D = trunc <2 x i32> %C to <2 x i8> ret <2 x i8> %D } @@ -2038,15 +2034,13 @@ define <2 x i8> @trunc_lshr_zext_nonuniform(<2 x i8> %A) { ret <2 x i8> %D } -define <3 x i8> @trunc_lshr_zext_nonuniform_undef(<3 x i8> %A) { -; ALL-LABEL: @trunc_lshr_zext_nonuniform_undef( -; ALL-NEXT: [[B:%.*]] = zext <3 x i8> [[A:%.*]] to <3 x i32> -; ALL-NEXT: [[C:%.*]] = lshr <3 x i32> [[B]], -; ALL-NEXT: [[D:%.*]] = trunc nuw <3 x i32> [[C]] to <3 x i8> +define <3 x i8> @trunc_lshr_zext_nonuniform_poison(<3 x i8> %A) { +; ALL-LABEL: @trunc_lshr_zext_nonuniform_poison( +; ALL-NEXT: [[D:%.*]] = lshr <3 x i8> [[A:%.*]], ; ALL-NEXT: ret <3 x i8> [[D]] ; %B = zext <3 x i8> %A to <3 x i32> - %C = lshr <3 x i32> %B, + %C = lshr <3 x i32> %B, %D = trunc <3 x i32> %C to <3 x i8> ret <3 x i8> %D } diff --git a/llvm/test/Transforms/InstCombine/ctpop-cttz.ll b/llvm/test/Transforms/InstCombine/ctpop-cttz.ll index 5d27f374d8921b..70868554bdc1bb 100644 --- a/llvm/test/Transforms/InstCombine/ctpop-cttz.ll +++ b/llvm/test/Transforms/InstCombine/ctpop-cttz.ll @@ -116,14 +116,14 @@ define <2 x i32> @ctpop3v(<2 
x i32> %0) { ret <2 x i32> %5 } -define <2 x i32> @ctpop3v_undef(<2 x i32> %0) { -; CHECK-LABEL: @ctpop3v_undef( +define <2 x i32> @ctpop3v_poison(<2 x i32> %0) { +; CHECK-LABEL: @ctpop3v_poison( ; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[TMP0:%.*]], i1 false), !range [[RNG0]] ; CHECK-NEXT: ret <2 x i32> [[TMP2]] ; %2 = sub <2 x i32> zeroinitializer, %0 %3 = and <2 x i32> %2, %0 - %4 = add <2 x i32> %3, + %4 = add <2 x i32> %3, %5 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %4) ret <2 x i32> %5 } diff --git a/llvm/test/Transforms/InstCombine/ctpop.ll b/llvm/test/Transforms/InstCombine/ctpop.ll index 27194724b7d83b..b3653e5071ba25 100644 --- a/llvm/test/Transforms/InstCombine/ctpop.ll +++ b/llvm/test/Transforms/InstCombine/ctpop.ll @@ -155,28 +155,27 @@ define <2 x i32> @_parity_of_not_vec(<2 x i32> %x) { ret <2 x i32> %r } -define <2 x i32> @_parity_of_not_undef(<2 x i32> %x) { -; CHECK-LABEL: @_parity_of_not_undef( +define <2 x i32> @_parity_of_not_poison(<2 x i32> %x) { +; CHECK-LABEL: @_parity_of_not_poison( ; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]]), !range [[RNG1]] ; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[TMP1]], ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %neg = xor <2 x i32> %x, + %neg = xor <2 x i32> %x, %cnt = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %neg) %r = and <2 x i32> %cnt, ret <2 x i32> %r } -define <2 x i32> @_parity_of_not_undef2(<2 x i32> %x) { -; CHECK-LABEL: @_parity_of_not_undef2( -; CHECK-NEXT: [[NEG:%.*]] = xor <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[CNT:%.*]] = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[NEG]]), !range [[RNG1]] -; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[CNT]], +define <2 x i32> @_parity_of_not_poison2(<2 x i32> %x) { +; CHECK-LABEL: @_parity_of_not_poison2( +; CHECK-NEXT: [[CNT:%.*]] = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]]), !range [[RNG1]] +; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[CNT]], ; CHECK-NEXT: ret <2 x i32> [[R]] ; %neg = xor <2 x i32> %x, %cnt = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %neg) - %r = and <2 x i32> %cnt, + %r = and <2 x i32> %cnt, ret <2 x i32> %r } diff --git a/llvm/test/Transforms/InstCombine/fabs-as-int.ll b/llvm/test/Transforms/InstCombine/fabs-as-int.ll index f32c00e453f22d..4e49ff159f875d 100644 --- a/llvm/test/Transforms/InstCombine/fabs-as-int.ll +++ b/llvm/test/Transforms/InstCombine/fabs-as-int.ll @@ -137,15 +137,15 @@ define <2 x i32> @not_fabs_as_int_v2f32_nonsplat(<2 x float> %x) { ret <2 x i32> %and } -define <3 x i32> @fabs_as_int_v3f32_undef(<3 x float> %x) { -; CHECK-LABEL: define <3 x i32> @fabs_as_int_v3f32_undef +define <3 x i32> @fabs_as_int_v3f32_poison(<3 x float> %x) { +; CHECK-LABEL: define <3 x i32> @fabs_as_int_v3f32_poison ; CHECK-SAME: (<3 x float> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call <3 x float> @llvm.fabs.v3f32(<3 x float> [[X]]) ; CHECK-NEXT: [[AND:%.*]] = bitcast <3 x float> [[TMP1]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[AND]] ; %bc = bitcast <3 x float> %x to <3 x i32> - %and = and <3 x i32> %bc, + %and = and <3 x i32> %bc, ret <3 x i32> %and } diff --git a/llvm/test/Transforms/InstCombine/fabs.ll b/llvm/test/Transforms/InstCombine/fabs.ll index 7e380c2e4590a0..5ec65784e7a348 100644 --- a/llvm/test/Transforms/InstCombine/fabs.ll +++ b/llvm/test/Transforms/InstCombine/fabs.ll @@ -321,7 +321,7 @@ define <2 x float> @select_fcmp_nnan_ole_negzero(<2 x float> %x) { ; CHECK-NEXT: ret <2 x float> [[FABS]] ; %lezero = fcmp ole <2 x float> %x, - %negx = fsub nnan <2 x float> , %x + 
%negx = fsub nnan <2 x float> , %x %fabs = select <2 x i1> %lezero, <2 x float> %negx, <2 x float> %x ret <2 x float> %fabs } @@ -332,7 +332,7 @@ define <2 x float> @select_nnan_fcmp_nnan_ole_negzero(<2 x float> %x) { ; CHECK-NEXT: ret <2 x float> [[FABS]] ; %lezero = fcmp ole <2 x float> %x, - %negx = fsub nnan <2 x float> , %x + %negx = fsub nnan <2 x float> , %x %fabs = select nnan <2 x i1> %lezero, <2 x float> %negx, <2 x float> %x ret <2 x float> %fabs } diff --git a/llvm/test/Transforms/InstCombine/fast-math.ll b/llvm/test/Transforms/InstCombine/fast-math.ll index 916955e34efacb..83f2091244e523 100644 --- a/llvm/test/Transforms/InstCombine/fast-math.ll +++ b/llvm/test/Transforms/InstCombine/fast-math.ll @@ -541,12 +541,12 @@ define float @fneg2(float %x) { ret float %sub } -define <2 x float> @fneg2_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fneg2_vec_undef( +define <2 x float> @fneg2_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fneg2_vec_poison( ; CHECK-NEXT: [[SUB:%.*]] = fneg nsz <2 x float> [[X:%.*]] ; CHECK-NEXT: ret <2 x float> [[SUB]] ; - %sub = fsub nsz <2 x float> , %x + %sub = fsub nsz <2 x float> , %x ret <2 x float> %sub } diff --git a/llvm/test/Transforms/InstCombine/fcmp-special.ll b/llvm/test/Transforms/InstCombine/fcmp-special.ll index 88bfe930ffdd63..64bc86f4266c78 100644 --- a/llvm/test/Transforms/InstCombine/fcmp-special.ll +++ b/llvm/test/Transforms/InstCombine/fcmp-special.ll @@ -144,21 +144,21 @@ define <2 x i1> @uno_vec_with_nan(<2 x double> %x) { ret <2 x i1> %f } -define <2 x i1> @uno_vec_with_undef(<2 x double> %x) { -; CHECK-LABEL: @uno_vec_with_undef( +define <2 x i1> @uno_vec_with_poison(<2 x double> %x) { +; CHECK-LABEL: @uno_vec_with_poison( ; CHECK-NEXT: [[F:%.*]] = fcmp uno <2 x double> [[X:%.*]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[F]] ; - %f = fcmp uno <2 x double> %x, + %f = fcmp uno <2 x double> %x, ret <2 x i1> %f } -define <2 x i1> @ord_vec_with_undef(<2 x double> %x) { -; CHECK-LABEL: @ord_vec_with_undef( -; CHECK-NEXT: [[F:%.*]] = fcmp ord <2 x double> [[X:%.*]], +define <2 x i1> @ord_vec_with_poison(<2 x double> %x) { +; CHECK-LABEL: @ord_vec_with_poison( +; CHECK-NEXT: [[F:%.*]] = fcmp ord <2 x double> [[X:%.*]], ; CHECK-NEXT: ret <2 x i1> [[F]] ; - %f = fcmp ord <2 x double> %x, + %f = fcmp ord <2 x double> %x, ret <2 x i1> %f } @@ -224,12 +224,12 @@ define <2 x i1> @negative_zero_olt_vec(<2 x float> %x) { ret <2 x i1> %r } -define <2 x i1> @negative_zero_une_vec_undef(<2 x double> %x) { -; CHECK-LABEL: @negative_zero_une_vec_undef( +define <2 x i1> @negative_zero_une_vec_poison(<2 x double> %x) { +; CHECK-LABEL: @negative_zero_une_vec_poison( ; CHECK-NEXT: [[R:%.*]] = fcmp nnan une <2 x double> [[X:%.*]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %r = fcmp nnan une <2 x double> %x, + %r = fcmp nnan une <2 x double> %x, ret <2 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/fcmp.ll b/llvm/test/Transforms/InstCombine/fcmp.ll index 069512b0f2d8eb..389264e2f70759 100644 --- a/llvm/test/Transforms/InstCombine/fcmp.ll +++ b/llvm/test/Transforms/InstCombine/fcmp.ll @@ -102,12 +102,12 @@ define <2 x i1> @unary_fneg_constant_swap_pred_vec(<2 x float> %x) { ret <2 x i1> %cmp } -define <2 x i1> @fneg_constant_swap_pred_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fneg_constant_swap_pred_vec_undef( +define <2 x i1> @fneg_constant_swap_pred_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fneg_constant_swap_pred_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = fcmp olt <2 x float> [[X:%.*]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; - 
%neg = fsub <2 x float> , %x + %neg = fsub <2 x float> , %x %cmp = fcmp ogt <2 x float> %neg, ret <2 x i1> %cmp } @@ -234,34 +234,34 @@ define <2 x i1> @fneg_unary_fneg_swap_pred_vec(<2 x float> %x, <2 x float> %y) { ret <2 x i1> %cmp } -define <2 x i1> @fneg_fneg_swap_pred_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @fneg_fneg_swap_pred_vec_undef( +define <2 x i1> @fneg_fneg_swap_pred_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @fneg_fneg_swap_pred_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; - %neg1 = fsub <2 x float> , %x - %neg2 = fsub <2 x float> , %y + %neg1 = fsub <2 x float> , %x + %neg2 = fsub <2 x float> , %y %cmp = fcmp olt <2 x float> %neg1, %neg2 ret <2 x i1> %cmp } -define <2 x i1> @unary_fneg_fneg_swap_pred_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @unary_fneg_fneg_swap_pred_vec_undef( +define <2 x i1> @unary_fneg_fneg_swap_pred_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @unary_fneg_fneg_swap_pred_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %neg1 = fneg <2 x float> %x - %neg2 = fsub <2 x float> , %y + %neg2 = fsub <2 x float> , %y %cmp = fcmp olt <2 x float> %neg1, %neg2 ret <2 x i1> %cmp } -define <2 x i1> @fneg_unary_fneg_swap_pred_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @fneg_unary_fneg_swap_pred_vec_undef( +define <2 x i1> @fneg_unary_fneg_swap_pred_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @fneg_unary_fneg_swap_pred_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; - %neg1 = fsub <2 x float> , %x + %neg1 = fsub <2 x float> , %x %neg2 = fneg <2 x float> %y %cmp = fcmp olt <2 x float> %neg1, %neg2 ret <2 x i1> %cmp diff --git a/llvm/test/Transforms/InstCombine/fdiv.ll b/llvm/test/Transforms/InstCombine/fdiv.ll index a0710c2bb04847..ca11685c98417a 100644 --- a/llvm/test/Transforms/InstCombine/fdiv.ll +++ b/llvm/test/Transforms/InstCombine/fdiv.ll @@ -141,12 +141,12 @@ define <2 x float> @not_exact_inverse_vec_arcp(<2 x float> %x) { ret <2 x float> %div } -define <2 x float> @not_exact_inverse_vec_arcp_with_undef_elt(<2 x float> %x) { -; CHECK-LABEL: @not_exact_inverse_vec_arcp_with_undef_elt( -; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp <2 x float> [[X:%.*]], +define <2 x float> @not_exact_inverse_vec_arcp_with_poison_elt(<2 x float> %x) { +; CHECK-LABEL: @not_exact_inverse_vec_arcp_with_poison_elt( +; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp <2 x float> [[X:%.*]], ; CHECK-NEXT: ret <2 x float> [[DIV]] ; - %div = fdiv arcp <2 x float> %x, + %div = fdiv arcp <2 x float> %x, ret <2 x float> %div } @@ -333,13 +333,13 @@ define <2 x float> @unary_fneg_fneg_vec(<2 x float> %x, <2 x float> %y) { ret <2 x float> %div } -define <2 x float> @fneg_fneg_vec_undef_elts(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @fneg_fneg_vec_undef_elts( +define <2 x float> @fneg_fneg_vec_poison_elts(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @fneg_fneg_vec_poison_elts( ; CHECK-NEXT: [[DIV:%.*]] = fdiv <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[DIV]] ; - %xneg = fsub <2 x float> , %x - %yneg = fsub <2 x float> , %y + %xneg = fsub <2 x float> , %x + %yneg = fsub <2 x float> , %y %div = fdiv <2 x float> %xneg, %yneg ret <2 x float> %div } @@ -404,12 +404,12 @@ define <2 x float> @unary_fneg_dividend_constant_divisor_vec(<2 x float> %x) { ret <2 x float> %div 
} -define <2 x float> @fneg_dividend_constant_divisor_vec_undef_elt(<2 x float> %x) { -; CHECK-LABEL: @fneg_dividend_constant_divisor_vec_undef_elt( +define <2 x float> @fneg_dividend_constant_divisor_vec_poison_elt(<2 x float> %x) { +; CHECK-LABEL: @fneg_dividend_constant_divisor_vec_poison_elt( ; CHECK-NEXT: [[DIV:%.*]] = fdiv ninf <2 x float> [[X:%.*]], ; CHECK-NEXT: ret <2 x float> [[DIV]] ; - %neg = fsub <2 x float> , %x + %neg = fsub <2 x float> , %x %div = fdiv ninf <2 x float> %neg, ret <2 x float> %div } diff --git a/llvm/test/Transforms/InstCombine/fma.ll b/llvm/test/Transforms/InstCombine/fma.ll index 8b413ae6f664b0..cf3d7f3c525a5f 100644 --- a/llvm/test/Transforms/InstCombine/fma.ll +++ b/llvm/test/Transforms/InstCombine/fma.ll @@ -60,13 +60,13 @@ define <2 x float> @fma_unary_fneg_x_unary_fneg_y_vec(<2 x float> %x, <2 x float ret <2 x float> %fma } -define <2 x float> @fma_fneg_x_fneg_y_vec_undef(<2 x float> %x, <2 x float> %y, <2 x float> %z) { -; CHECK-LABEL: @fma_fneg_x_fneg_y_vec_undef( +define <2 x float> @fma_fneg_x_fneg_y_vec_poison(<2 x float> %x, <2 x float> %y, <2 x float> %z) { +; CHECK-LABEL: @fma_fneg_x_fneg_y_vec_poison( ; CHECK-NEXT: [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[Z:%.*]]) ; CHECK-NEXT: ret <2 x float> [[FMA]] ; - %xn = fsub <2 x float> , %x - %yn = fsub <2 x float> , %y + %xn = fsub <2 x float> , %x + %yn = fsub <2 x float> , %y %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %xn, <2 x float> %yn, <2 x float> %z) ret <2 x float> %fma } diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index 39f9e74f899d18..e9c86a1270493e 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -42,12 +42,12 @@ define <2 x float> @unary_neg_constant_vec(<2 x float> %x) { ret <2 x float> %mul } -define <2 x float> @neg_constant_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @neg_constant_vec_undef( +define <2 x float> @neg_constant_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @neg_constant_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = fmul ninf <2 x float> [[X:%.*]], ; CHECK-NEXT: ret <2 x float> [[MUL]] ; - %sub = fsub <2 x float> , %x + %sub = fsub <2 x float> , %x %mul = fmul ninf <2 x float> %sub, ret <2 x float> %mul } @@ -162,34 +162,34 @@ define <2 x float> @neg_unary_neg_vec(<2 x float> %x, <2 x float> %y) { ret <2 x float> %mul } -define <2 x float> @neg_neg_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @neg_neg_vec_undef( +define <2 x float> @neg_neg_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @neg_neg_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[MUL]] ; - %sub1 = fsub <2 x float> , %x - %sub2 = fsub <2 x float> , %y + %sub1 = fsub <2 x float> , %x + %sub2 = fsub <2 x float> , %y %mul = fmul arcp <2 x float> %sub1, %sub2 ret <2 x float> %mul } -define <2 x float> @unary_neg_neg_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @unary_neg_neg_vec_undef( +define <2 x float> @unary_neg_neg_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @unary_neg_neg_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[MUL]] ; %neg = fneg <2 x float> %x - %sub = fsub <2 x float> , %y + %sub = fsub <2 x float> , %y %mul = fmul arcp <2 x float> %neg, %sub ret <2 x float> %mul } -define <2 x float> @neg_unary_neg_vec_undef(<2 x float> %x, <2 x 
float> %y) { -; CHECK-LABEL: @neg_unary_neg_vec_undef( +define <2 x float> @neg_unary_neg_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @neg_unary_neg_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[MUL]] ; - %sub = fsub <2 x float> , %x + %sub = fsub <2 x float> , %x %neg = fneg <2 x float> %y %mul = fmul arcp <2 x float> %sub, %neg ret <2 x float> %mul @@ -322,13 +322,13 @@ define <2 x float> @unary_neg_mul_vec(<2 x float> %x, <2 x float> %y) { ret <2 x float> %mul } -define <2 x float> @neg_mul_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @neg_mul_vec_undef( +define <2 x float> @neg_mul_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @neg_mul_vec_poison( ; CHECK-NEXT: [[SUB:%.*]] = fneg <2 x float> [[X:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[SUB]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[MUL]] ; - %sub = fsub <2 x float> , %x + %sub = fsub <2 x float> , %x %mul = fmul <2 x float> %sub, %y ret <2 x float> %mul } @@ -388,9 +388,9 @@ define void @test8(ptr %inout, i1 %c1) { entry: %0 = load i32, ptr %inout, align 4 %conv = uitofp i32 %0 to float - %vecinit = insertelement <4 x float> , float %conv, i32 3 + %vecinit = insertelement <4 x float> , float %conv, i32 3 %sub = fsub <4 x float> , %vecinit - %1 = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> + %1 = shufflevector <4 x float> %sub, <4 x float> poison, <4 x i32> %mul = fmul <4 x float> zeroinitializer, %1 br label %for.cond @@ -742,7 +742,7 @@ define <4 x float> @fdiv_constant_denominator_fmul_vec_constexpr(<4 x float> %x) ; CHECK-NEXT: [[T3:%.*]] = fmul reassoc <4 x float> [[X:%.*]], ; CHECK-NEXT: ret <4 x float> [[T3]] ; - %constExprMul = bitcast i128 trunc (i160 bitcast (<5 x float> to i160) to i128) to <4 x float> + %constExprMul = bitcast i128 trunc (i160 bitcast (<5 x float> to i160) to i128) to <4 x float> %t1 = fdiv reassoc <4 x float> %x, %t3 = fmul reassoc <4 x float> %t1, %constExprMul ret <4 x float> %t3 @@ -1270,7 +1270,7 @@ define @mul_scalable_splat_zero( %z) { ; CHECK-LABEL: @mul_scalable_splat_zero( ; CHECK-NEXT: ret zeroinitializer ; - %shuf = shufflevector insertelement ( undef, float 0.0, i32 0), undef, zeroinitializer + %shuf = shufflevector insertelement ( poison, float 0.0, i32 0), poison, zeroinitializer %t3 = fmul fast %shuf, %z ret %t3 } @@ -1393,7 +1393,7 @@ define <3 x float> @mul_neg_zero_nnan_ninf_vec(<3 x float> nofpclass(inf nan) %a ; CHECK-NEXT: ret <3 x float> [[RET]] ; entry: - %ret = fmul <3 x float> %a, + %ret = fmul <3 x float> %a, ret <3 x float> %ret } diff --git a/llvm/test/Transforms/InstCombine/fneg-as-int.ll b/llvm/test/Transforms/InstCombine/fneg-as-int.ll index d28e599cacf364..e3067b0d024614 100644 --- a/llvm/test/Transforms/InstCombine/fneg-as-int.ll +++ b/llvm/test/Transforms/InstCombine/fneg-as-int.ll @@ -139,15 +139,15 @@ define <2 x i32> @not_fneg_as_int_v2f32_nonsplat(<2 x float> %x) { ret <2 x i32> %xor } -define <3 x i32> @fneg_as_int_v3f32_undef(<3 x float> %x) { -; CHECK-LABEL: define <3 x i32> @fneg_as_int_v3f32_undef +define <3 x i32> @fneg_as_int_v3f32_poison(<3 x float> %x) { +; CHECK-LABEL: define <3 x i32> @fneg_as_int_v3f32_poison ; CHECK-SAME: (<3 x float> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = fneg <3 x float> [[X]] ; CHECK-NEXT: [[XOR:%.*]] = bitcast <3 x float> [[TMP1]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[XOR]] ; %bc = bitcast <3 x float> %x to <3 x i32> - %xor = xor <3 x i32> %bc, + %xor = xor <3 x i32> %bc, ret <3 x 
i32> %xor } diff --git a/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll b/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll index 9aa8d4944e39ab..8c3e6958fe083e 100644 --- a/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll +++ b/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll @@ -158,8 +158,8 @@ define <2 x i32> @not_fneg_fabs_as_int_v2f32_nonsplat(<2 x float> %x) { ret <2 x i32> %or } -define <3 x i32> @fneg_fabs_as_int_v3f32_undef(<3 x float> %x) { -; CHECK-LABEL: define <3 x i32> @fneg_fabs_as_int_v3f32_undef +define <3 x i32> @fneg_fabs_as_int_v3f32_poison(<3 x float> %x) { +; CHECK-LABEL: define <3 x i32> @fneg_fabs_as_int_v3f32_poison ; CHECK-SAME: (<3 x float> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call <3 x float> @llvm.fabs.v3f32(<3 x float> [[X]]) ; CHECK-NEXT: [[TMP2:%.*]] = fneg <3 x float> [[TMP1]] @@ -167,7 +167,7 @@ define <3 x i32> @fneg_fabs_as_int_v3f32_undef(<3 x float> %x) { ; CHECK-NEXT: ret <3 x i32> [[OR]] ; %bc = bitcast <3 x float> %x to <3 x i32> - %or = or <3 x i32> %bc, + %or = or <3 x i32> %bc, ret <3 x i32> %or } diff --git a/llvm/test/Transforms/InstCombine/fneg.ll b/llvm/test/Transforms/InstCombine/fneg.ll index ed68ba50d36ee9..7c9289c447113f 100644 --- a/llvm/test/Transforms/InstCombine/fneg.ll +++ b/llvm/test/Transforms/InstCombine/fneg.ll @@ -87,24 +87,24 @@ define float @fmul_fneg_extra_use(float %x) { ret float %r } -; Try a vector. Use special constants (NaN, INF, undef) because they don't change anything. +; Try a vector. Use special constants (NaN, INF, poison) because they don't change anything. define <4 x double> @fmul_fsub_vec(<4 x double> %x) { ; CHECK-LABEL: @fmul_fsub_vec( -; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], ; CHECK-NEXT: ret <4 x double> [[R]] ; - %m = fmul <4 x double> %x, + %m = fmul <4 x double> %x, %r = fsub <4 x double> , %m ret <4 x double> %r } define <4 x double> @fmul_fneg_vec(<4 x double> %x) { ; CHECK-LABEL: @fmul_fneg_vec( -; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], ; CHECK-NEXT: ret <4 x double> [[R]] ; - %m = fmul <4 x double> %x, + %m = fmul <4 x double> %x, %r = fneg <4 x double> %m ret <4 x double> %r } @@ -181,24 +181,24 @@ define float @fdiv_op1_constant_fneg_extra_use(float %x) { ret float %r } -; Try a vector. Use special constants (NaN, INF, undef) because they don't change anything. +; Try a vector. Use special constants (NaN, INF, poison) because they don't change anything. define <4 x double> @fdiv_op1_constant_fsub_vec(<4 x double> %x) { ; CHECK-LABEL: @fdiv_op1_constant_fsub_vec( -; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> [[X:%.*]], ; CHECK-NEXT: ret <4 x double> [[R]] ; - %d = fdiv <4 x double> %x, + %d = fdiv <4 x double> %x, %r = fsub <4 x double> , %d ret <4 x double> %r } define <4 x double> @fdiv_op1_constant_fneg_vec(<4 x double> %x) { ; CHECK-LABEL: @fdiv_op1_constant_fneg_vec( -; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> [[X:%.*]], ; CHECK-NEXT: ret <4 x double> [[R]] ; - %d = fdiv <4 x double> %x, + %d = fdiv <4 x double> %x, %r = fneg <4 x double> %d ret <4 x double> %r } @@ -335,24 +335,24 @@ define float @fdiv_op0_constant_fneg_extra_use(float %x) { ret float %r } -; Try a vector. Use special constants (NaN, INF, undef) because they don't change anything. +; Try a vector. 
Use special constants (NaN, INF, poison) because they don't change anything. define <4 x double> @fdiv_op0_constant_fsub_vec(<4 x double> %x) { ; CHECK-LABEL: @fdiv_op0_constant_fsub_vec( -; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] ; CHECK-NEXT: ret <4 x double> [[R]] ; - %d = fdiv <4 x double> , %x + %d = fdiv <4 x double> , %x %r = fsub <4 x double> , %d ret <4 x double> %r } define <4 x double> @fdiv_op0_constant_fneg_vec(<4 x double> %x) { ; CHECK-LABEL: @fdiv_op0_constant_fneg_vec( -; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] ; CHECK-NEXT: ret <4 x double> [[R]] ; - %d = fdiv <4 x double> , %x + %d = fdiv <4 x double> , %x %r = fneg <4 x double> %d ret <4 x double> %r } @@ -584,11 +584,11 @@ define <2 x float> @fneg_nsz_fadd_constant_vec(<2 x float> %x) { define <2 x float> @fake_fneg_nsz_fadd_constant_vec(<2 x float> %x) { ; CHECK-LABEL: @fake_fneg_nsz_fadd_constant_vec( -; CHECK-NEXT: [[R:%.*]] = fsub nsz <2 x float> , [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = fsub nsz <2 x float> , [[X:%.*]] ; CHECK-NEXT: ret <2 x float> [[R]] ; - %a = fadd <2 x float> %x, - %r = fsub nsz <2 x float> , %a + %a = fadd <2 x float> %x, + %r = fsub nsz <2 x float> , %a ret <2 x float> %r } diff --git a/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll b/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll index b482cfdfde197b..1fd570bf2635b2 100644 --- a/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll +++ b/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll @@ -36,36 +36,36 @@ define <4 x i32> @t1_vec_splat(<4 x i32> %x, <4 x i32> %y) { ret <4 x i32> %t2 } -define <4 x i32> @t2_vec_undef0(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @t2_vec_undef0( +define <4 x i32> @t2_vec_poison0(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @t2_vec_poison0( ; CHECK-NEXT: [[T2:%.*]] = sub <4 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[T2]] ; - %t0 = xor <4 x i32> %x, + %t0 = xor <4 x i32> %x, %t1 = add <4 x i32> %t0, %y %t2 = add <4 x i32> %t1, ret <4 x i32> %t2 } -define <4 x i32> @t3_vec_undef1(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @t3_vec_undef1( +define <4 x i32> @t3_vec_poison1(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @t3_vec_poison1( ; CHECK-NEXT: [[T2:%.*]] = sub <4 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[T2]] ; %t0 = xor <4 x i32> %x, %t1 = add <4 x i32> %t0, %y - %t2 = add <4 x i32> %t1, + %t2 = add <4 x i32> %t1, ret <4 x i32> %t2 } -define <4 x i32> @t4_vec_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @t4_vec_undef2( +define <4 x i32> @t4_vec_poison2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @t4_vec_poison2( ; CHECK-NEXT: [[T2:%.*]] = sub <4 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[T2]] ; - %t0 = xor <4 x i32> %x, + %t0 = xor <4 x i32> %x, %t1 = add <4 x i32> %t0, %y - %t2 = add <4 x i32> %t1, + %t2 = add <4 x i32> %t1, ret <4 x i32> %t2 } diff --git a/llvm/test/Transforms/InstCombine/fold-sub-of-not-to-inc-of-add.ll b/llvm/test/Transforms/InstCombine/fold-sub-of-not-to-inc-of-add.ll index 6f311f05fb0176..af580ba57513c2 100644 --- a/llvm/test/Transforms/InstCombine/fold-sub-of-not-to-inc-of-add.ll +++ b/llvm/test/Transforms/InstCombine/fold-sub-of-not-to-inc-of-add.ll @@ -50,13 +50,13 @@ define <4 x i32> @p1_vector_splat(<4 x i32> %x, <4 x i32> %y) { ret <4 x i32> %t1 } 
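The two fold-*-of-not files in this stretch of the diff test complementary directions of the same two's-complement identity, ~x == -x - 1. A reduced scalar sketch of both folds (function names and bodies are mine for illustration; the patch's own vector constants are elided in this excerpt):

define i32 @sketch_inc_of_add_of_not(i32 %x, i32 %y) {
  ; (~x + y) + 1 == (-x - 1 + y) + 1 == y - x, so this folds to a single sub
  %not = xor i32 %x, -1
  %add = add i32 %not, %y
  %inc = add i32 %add, 1
  ret i32 %inc
}

define i32 @sketch_sub_of_not(i32 %x, i32 %y) {
  ; y - ~x == y - (-x - 1) == (x + y) + 1, the inverse rewrite
  %not = xor i32 %x, -1
  %sub = sub i32 %y, %not
  ret i32 %sub
}

The renamed vector tests check that a -1 splat carrying a poison lane still matches the all-ones xor these folds look for.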
-define <4 x i32> @p2_vector_undef(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @p2_vector_undef( +define <4 x i32> @p2_vector_poison(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @p2_vector_poison( ; CHECK-NEXT: [[T0_NEG:%.*]] = add <4 x i32> [[X:%.*]], ; CHECK-NEXT: [[T1:%.*]] = add <4 x i32> [[T0_NEG]], [[Y:%.*]] ; CHECK-NEXT: ret <4 x i32> [[T1]] ; - %t0 = xor <4 x i32> %x, + %t0 = xor <4 x i32> %x, %t1 = sub <4 x i32> %y, %t0 ret <4 x i32> %t1 } diff --git a/llvm/test/Transforms/InstCombine/fpcast.ll b/llvm/test/Transforms/InstCombine/fpcast.ll index d2c932ba447e4e..69daac773a6455 100644 --- a/llvm/test/Transforms/InstCombine/fpcast.ll +++ b/llvm/test/Transforms/InstCombine/fpcast.ll @@ -51,13 +51,13 @@ define half @unary_fneg_fptrunc(float %a) { ret half %c } -define <2 x half> @fneg_fptrunc_vec_undef(<2 x float> %a) { -; CHECK-LABEL: @fneg_fptrunc_vec_undef( +define <2 x half> @fneg_fptrunc_vec_poison(<2 x float> %a) { +; CHECK-LABEL: @fneg_fptrunc_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = fptrunc <2 x float> [[A:%.*]] to <2 x half> ; CHECK-NEXT: [[C:%.*]] = fneg <2 x half> [[TMP1]] ; CHECK-NEXT: ret <2 x half> [[C]] ; - %b = fsub <2 x float> , %a + %b = fsub <2 x float> , %a %c = fptrunc <2 x float> %b to <2 x half> ret <2 x half> %c } diff --git a/llvm/test/Transforms/InstCombine/fsub.ll b/llvm/test/Transforms/InstCombine/fsub.ll index 6e13c33b126d59..f1e7086e697e86 100644 --- a/llvm/test/Transforms/InstCombine/fsub.ll +++ b/llvm/test/Transforms/InstCombine/fsub.ll @@ -153,12 +153,12 @@ define <2 x float> @constant_op1_vec(<2 x float> %x, <2 x float> %y) { ret <2 x float> %r } -define <2 x float> @constant_op1_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @constant_op1_vec_undef( -; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[X:%.*]], +define <2 x float> @constant_op1_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @constant_op1_vec_poison( +; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[X:%.*]], ; CHECK-NEXT: ret <2 x float> [[R]] ; - %r = fsub <2 x float> %x, + %r = fsub <2 x float> %x, ret <2 x float> %r } @@ -204,12 +204,12 @@ define <2 x float> @unary_neg_op1_vec(<2 x float> %x, <2 x float> %y) { ret <2 x float> %r } -define <2 x float> @neg_op1_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @neg_op1_vec_undef( +define <2 x float> @neg_op1_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @neg_op1_vec_poison( ; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[R]] ; - %negy = fsub <2 x float> , %y + %negy = fsub <2 x float> , %y %r = fsub <2 x float> %x, %negy ret <2 x float> %r } diff --git a/llvm/test/Transforms/InstCombine/funnel.ll b/llvm/test/Transforms/InstCombine/funnel.ll index 162519e648f3e4..a54e6e4642b753 100644 --- a/llvm/test/Transforms/InstCombine/funnel.ll +++ b/llvm/test/Transforms/InstCombine/funnel.ll @@ -43,24 +43,24 @@ define <2 x i16> @fshl_v2i16_constant_splat(<2 x i16> %x, <2 x i16> %y) { ret <2 x i16> %r } -define <2 x i16> @fshl_v2i16_constant_splat_undef0(<2 x i16> %x, <2 x i16> %y) { -; CHECK-LABEL: @fshl_v2i16_constant_splat_undef0( +define <2 x i16> @fshl_v2i16_constant_splat_poison0(<2 x i16> %x, <2 x i16> %y) { +; CHECK-LABEL: @fshl_v2i16_constant_splat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[Y:%.*]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; - %shl = shl <2 x i16> %x, + %shl = shl <2 x i16> %x, %shr = lshr <2 x i16> %y, %r = or <2 x i16> %shl, %shr ret <2 x i16> %r } -define <2 x i16> 
@fshl_v2i16_constant_splat_undef1(<2 x i16> %x, <2 x i16> %y) { -; CHECK-LABEL: @fshl_v2i16_constant_splat_undef1( +define <2 x i16> @fshl_v2i16_constant_splat_poison1(<2 x i16> %x, <2 x i16> %y) { +; CHECK-LABEL: @fshl_v2i16_constant_splat_poison1( ; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[Y:%.*]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; %shl = shl <2 x i16> %x, - %shr = lshr <2 x i16> %y, + %shr = lshr <2 x i16> %y, %r = or <2 x i16> %shl, %shr ret <2 x i16> %r } @@ -78,30 +78,30 @@ define <2 x i17> @fshr_v2i17_constant_splat(<2 x i17> %x, <2 x i17> %y) { ret <2 x i17> %r } -define <2 x i17> @fshr_v2i17_constant_splat_undef0(<2 x i17> %x, <2 x i17> %y) { -; CHECK-LABEL: @fshr_v2i17_constant_splat_undef0( +define <2 x i17> @fshr_v2i17_constant_splat_poison0(<2 x i17> %x, <2 x i17> %y) { +; CHECK-LABEL: @fshr_v2i17_constant_splat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[Y:%.*]], <2 x i17> [[X:%.*]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; - %shr = lshr <2 x i17> %x, - %shl = shl <2 x i17> %y, + %shr = lshr <2 x i17> %x, + %shl = shl <2 x i17> %y, %r = or <2 x i17> %shr, %shl ret <2 x i17> %r } -define <2 x i17> @fshr_v2i17_constant_splat_undef1(<2 x i17> %x, <2 x i17> %y) { -; CHECK-LABEL: @fshr_v2i17_constant_splat_undef1( +define <2 x i17> @fshr_v2i17_constant_splat_poison1(<2 x i17> %x, <2 x i17> %y) { +; CHECK-LABEL: @fshr_v2i17_constant_splat_poison1( ; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[Y:%.*]], <2 x i17> [[X:%.*]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; - %shr = lshr <2 x i17> %x, - %shl = shl <2 x i17> %y, + %shr = lshr <2 x i17> %x, + %shl = shl <2 x i17> %y, %r = or <2 x i17> %shr, %shl ret <2 x i17> %r } ; Allow arbitrary shift constants. -; Support undef elements. +; Support poison elements. 
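The funnel.ll tests above and below pin down the or-of-complementary-shifts canonicalization: when the two shift amounts add up to the bit width, or(shl(x, C), lshr(y, BW - C)) is a funnel shift, and a poison lane in a shift-amount splat no longer blocks the match. A reduced scalar sketch with assumed shift amounts (the excerpt's vector constants are elided):

define i16 @sketch_funnel_by_9(i16 %x, i16 %y) {
  ; 9 + 7 == 16, the bit width, so this folds to
  ; call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 9)
  %hi = shl i16 %x, 9
  %lo = lshr i16 %y, 7
  %r = or i16 %hi, %lo
  ret i16 %r
}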
define <2 x i32> @fshr_v2i32_constant_nonsplat(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @fshr_v2i32_constant_nonsplat( @@ -114,24 +114,24 @@ define <2 x i32> @fshr_v2i32_constant_nonsplat(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %r } -define <2 x i32> @fshr_v2i32_constant_nonsplat_undef0(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @fshr_v2i32_constant_nonsplat_undef0( +define <2 x i32> @fshr_v2i32_constant_nonsplat_poison0(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @fshr_v2i32_constant_nonsplat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[Y:%.*]], <2 x i32> [[X:%.*]], <2 x i32> ) ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %shr = lshr <2 x i32> %x, + %shr = lshr <2 x i32> %x, %shl = shl <2 x i32> %y, %r = or <2 x i32> %shl, %shr ret <2 x i32> %r } -define <2 x i32> @fshr_v2i32_constant_nonsplat_undef1(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @fshr_v2i32_constant_nonsplat_undef1( -; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[Y:%.*]], <2 x i32> [[X:%.*]], <2 x i32> ) +define <2 x i32> @fshr_v2i32_constant_nonsplat_poison1(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @fshr_v2i32_constant_nonsplat_poison1( +; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[Y:%.*]], <2 x i32> [[X:%.*]], <2 x i32> ) ; CHECK-NEXT: ret <2 x i32> [[R]] ; %shr = lshr <2 x i32> %x, - %shl = shl <2 x i32> %y, + %shl = shl <2 x i32> %y, %r = or <2 x i32> %shl, %shr ret <2 x i32> %r } @@ -147,13 +147,13 @@ define <2 x i36> @fshl_v2i36_constant_nonsplat(<2 x i36> %x, <2 x i36> %y) { ret <2 x i36> %r } -define <3 x i36> @fshl_v3i36_constant_nonsplat_undef0(<3 x i36> %x, <3 x i36> %y) { -; CHECK-LABEL: @fshl_v3i36_constant_nonsplat_undef0( -; CHECK-NEXT: [[R:%.*]] = call <3 x i36> @llvm.fshl.v3i36(<3 x i36> [[X:%.*]], <3 x i36> [[Y:%.*]], <3 x i36> ) +define <3 x i36> @fshl_v3i36_constant_nonsplat_poison0(<3 x i36> %x, <3 x i36> %y) { +; CHECK-LABEL: @fshl_v3i36_constant_nonsplat_poison0( +; CHECK-NEXT: [[R:%.*]] = call <3 x i36> @llvm.fshl.v3i36(<3 x i36> [[X:%.*]], <3 x i36> [[Y:%.*]], <3 x i36> ) ; CHECK-NEXT: ret <3 x i36> [[R]] ; - %shl = shl <3 x i36> %x, - %shr = lshr <3 x i36> %y, + %shl = shl <3 x i36> %x, + %shr = lshr <3 x i36> %y, %r = or <3 x i36> %shl, %shr ret <3 x i36> %r } diff --git a/llvm/test/Transforms/InstCombine/get-lowbitmask-upto-and-including-bit.ll b/llvm/test/Transforms/InstCombine/get-lowbitmask-upto-and-including-bit.ll index 12a81f0cd2f0fb..40caa57891369f 100644 --- a/llvm/test/Transforms/InstCombine/get-lowbitmask-upto-and-including-bit.ll +++ b/llvm/test/Transforms/InstCombine/get-lowbitmask-upto-and-including-bit.ll @@ -41,36 +41,36 @@ define <2 x i8> @t2_vec(<2 x i8> %x) { %mask = or <2 x i8> %lowbitmask, %bitmask ret <2 x i8> %mask } -define <3 x i8> @t3_vec_undef0(<3 x i8> %x) { -; CHECK-LABEL: @t3_vec_undef0( +define <3 x i8> @t3_vec_poison0(<3 x i8> %x) { +; CHECK-LABEL: @t3_vec_poison0( ; CHECK-NEXT: [[TMP1:%.*]] = sub <3 x i8> , [[X:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = lshr <3 x i8> , [[TMP1]] ; CHECK-NEXT: ret <3 x i8> [[MASK]] ; - %bitmask = shl <3 x i8> , %x + %bitmask = shl <3 x i8> , %x %lowbitmask = add <3 x i8> %bitmask, %mask = or <3 x i8> %lowbitmask, %bitmask ret <3 x i8> %mask } -define <3 x i8> @t4_vec_undef1(<3 x i8> %x) { -; CHECK-LABEL: @t4_vec_undef1( +define <3 x i8> @t4_vec_poison1(<3 x i8> %x) { +; CHECK-LABEL: @t4_vec_poison1( ; CHECK-NEXT: [[TMP1:%.*]] = sub <3 x i8> , [[X:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = lshr <3 x i8> , [[TMP1]] ; CHECK-NEXT: ret <3 x i8> [[MASK]] ; 
%bitmask = shl <3 x i8> , %x - %lowbitmask = add <3 x i8> %bitmask, + %lowbitmask = add <3 x i8> %bitmask, %mask = or <3 x i8> %lowbitmask, %bitmask ret <3 x i8> %mask } -define <3 x i8> @t5_vec_undef2(<3 x i8> %x) { -; CHECK-LABEL: @t5_vec_undef2( +define <3 x i8> @t5_vec_poison2(<3 x i8> %x) { +; CHECK-LABEL: @t5_vec_poison2( ; CHECK-NEXT: [[TMP1:%.*]] = sub <3 x i8> , [[X:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = lshr <3 x i8> , [[TMP1]] ; CHECK-NEXT: ret <3 x i8> [[MASK]] ; - %bitmask = shl <3 x i8> , %x - %lowbitmask = add <3 x i8> %bitmask, + %bitmask = shl <3 x i8> , %x + %lowbitmask = add <3 x i8> %bitmask, %mask = or <3 x i8> %lowbitmask, %bitmask ret <3 x i8> %mask } diff --git a/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll b/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll index c8f14595ea6738..e4cae135197830 100644 --- a/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll +++ b/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll @@ -55,14 +55,14 @@ define <2 x i8> @t2_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i8> %negbias } -define <2 x i8> @t3_vec_undef(<2 x i8> %x, <2 x i8> %y) { -; CHECK-LABEL: @t3_vec_undef( +define <2 x i8> @t3_vec_poison(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @t3_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i8> [[Y:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: [[NEGBIAS:%.*]] = sub <2 x i8> zeroinitializer, [[TMP2]] ; CHECK-NEXT: ret <2 x i8> [[NEGBIAS]] ; - %negy = sub <2 x i8> , %y + %negy = sub <2 x i8> , %y %unbiasedx = and <2 x i8> %negy, %x %negbias = sub <2 x i8> %unbiasedx, %x ret <2 x i8> %negbias diff --git a/llvm/test/Transforms/InstCombine/hoist-not-from-ashr-operand.ll b/llvm/test/Transforms/InstCombine/hoist-not-from-ashr-operand.ll index e0242855e26839..2217666f0f49a2 100644 --- a/llvm/test/Transforms/InstCombine/hoist-not-from-ashr-operand.ll +++ b/llvm/test/Transforms/InstCombine/hoist-not-from-ashr-operand.ll @@ -41,14 +41,14 @@ define <2 x i8> @t2_vec(<2 x i8> %x, <2 x i8> %y) { %ashr = ashr <2 x i8> %not_x, %y ret <2 x i8> %ashr } -; Note that we must sanitize undef elts of -1 constant to -1 or 0. -define <2 x i8> @t3_vec_undef(<2 x i8> %x, <2 x i8> %y) { -; CHECK-LABEL: @t3_vec_undef( +; Note that we must sanitize poison elts of -1 constant to -1 or 0. 
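(To make the sanitize note concrete, a hedged before/after sketch of the hoist, using hypothetical function names and fully defined illustrative constants. The identity being used is that bitwise not commutes with ashr, so the not applied to the shift operand can be rebuilt on the shift result; when the original all-ones splat carries poison lanes, the rebuilt constant must use a defined value, -1 or 0, in those lanes:

define <2 x i8> @sketch_before(<2 x i8> %x, <2 x i8> %y) {
  %not_x = xor <2 x i8> %x, <i8 -1, i8 -1>    ; not %x
  %ashr = ashr <2 x i8> %not_x, %y
  ret <2 x i8> %ashr
}

define <2 x i8> @sketch_after(<2 x i8> %x, <2 x i8> %y) {
  %shift = ashr <2 x i8> %x, %y
  %not = xor <2 x i8> %shift, <i8 -1, i8 -1>  ; not hoisted past the ashr
  ret <2 x i8> %not
})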
+define <2 x i8> @t3_vec_poison(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @t3_vec_poison( ; CHECK-NEXT: [[NOT_X_NOT:%.*]] = ashr <2 x i8> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[ASHR:%.*]] = xor <2 x i8> [[NOT_X_NOT]], ; CHECK-NEXT: ret <2 x i8> [[ASHR]] ; - %not_x = xor <2 x i8> %x, + %not_x = xor <2 x i8> %x, %ashr = ashr <2 x i8> %not_x, %y ret <2 x i8> %ashr } diff --git a/llvm/test/Transforms/InstCombine/icmp-uge-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-uge-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll index 5adf476f7a79fd..32ef6267cdf8b4 100644 --- a/llvm/test/Transforms/InstCombine/icmp-uge-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-uge-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll @@ -56,8 +56,8 @@ define <3 x i1> @p2_vec_undef0(<3 x i8> %val, <3 x i8> %bits) { ; CHECK-LABEL: @p2_vec_undef0( ; CHECK-NEXT: [[T0:%.*]] = shl <3 x i8> , [[BITS:%.*]] ; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]]) -; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS]] -; CHECK-NEXT: [[R:%.*]] = icmp eq <3 x i8> [[VAL_HIGHBITS]], zeroinitializer +; CHECK-NEXT: [[T1:%.*]] = add <3 x i8> [[T0]], +; CHECK-NEXT: [[R:%.*]] = icmp uge <3 x i8> [[T1]], [[VAL:%.*]] ; CHECK-NEXT: ret <3 x i1> [[R]] ; %t0 = shl <3 x i8> , %bits diff --git a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll index 7f4603881f23c8..27b02c8c6e9366 100644 --- a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll @@ -40,38 +40,38 @@ define <2 x i1> @p1_vec(<2 x i8> %val, <2 x i8> %bits) { ret <2 x i1> %r } -define <3 x i1> @p2_vec_undef0(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef0( +define <3 x i1> @p2_vec_poison0(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison0( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; - %t0 = shl <3 x i8> , %bits + %t0 = shl <3 x i8> , %bits %t1 = xor <3 x i8> %t0, %r = icmp uge <3 x i8> %t1, %val ret <3 x i1> %r } -define <3 x i1> @p2_vec_undef1(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef1( +define <3 x i1> @p2_vec_poison1(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison1( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; %t0 = shl <3 x i8> , %bits - %t1 = xor <3 x i8> %t0, + %t1 = xor <3 x i8> %t0, %r = icmp uge <3 x i8> %t1, %val ret <3 x i1> %r } -define <3 x i1> @p2_vec_undef2(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef2( +define <3 x i1> @p2_vec_poison2(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison2( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; 
- %t0 = shl <3 x i8> , %bits - %t1 = xor <3 x i8> %t0, + %t0 = shl <3 x i8> , %bits + %t1 = xor <3 x i8> %t0, %r = icmp uge <3 x i8> %t1, %val ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/icmp-ugt-of-shl-1-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ugt-of-shl-1-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll index 550e8bb17229f5..72cfb5a9f8bd01 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ugt-of-shl-1-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ugt-of-shl-1-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll @@ -38,13 +38,13 @@ define <2 x i1> @p1_vec(<2 x i8> %val, <2 x i8> %bits) { ret <2 x i1> %r } -define <3 x i1> @p2_vec_undef(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef( +define <3 x i1> @p2_vec_poison(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; - %t0 = shl <3 x i8> , %bits + %t0 = shl <3 x i8> , %bits %r = icmp ugt <3 x i8> %t0, %val ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/icmp-ule-of-shl-1-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ule-of-shl-1-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll index 26b667d36728ac..79e6914f095313 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ule-of-shl-1-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ule-of-shl-1-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll @@ -38,13 +38,13 @@ define <2 x i1> @p1_vec(<2 x i8> %val, <2 x i8> %bits) { ret <2 x i1> %r } -define <3 x i1> @p2_vec_undef(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef( +define <3 x i1> @p2_vec_poison(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; - %t0 = shl <3 x i8> , %bits + %t0 = shl <3 x i8> , %bits %r = icmp ule <3 x i8> %t0, %val ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/icmp-ult-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ult-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll index dd353d44218bf1..25894a22f00751 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ult-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ult-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll @@ -56,8 +56,8 @@ define <3 x i1> @p2_vec_undef0(<3 x i8> %val, <3 x i8> %bits) { ; CHECK-LABEL: @p2_vec_undef0( ; CHECK-NEXT: [[T0:%.*]] = shl <3 x i8> , [[BITS:%.*]] ; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]]) -; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS]] -; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i8> [[VAL_HIGHBITS]], zeroinitializer +; CHECK-NEXT: [[T1:%.*]] = add <3 x i8> [[T0]], +; CHECK-NEXT: [[R:%.*]] = icmp ult <3 x i8> [[T1]], [[VAL:%.*]] ; CHECK-NEXT: ret <3 x i1> [[R]] ; %t0 = shl <3 x i8> , %bits diff --git 
a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll index c7a45c5cdc11ad..8441033d4857ea 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll @@ -40,38 +40,38 @@ define <2 x i1> @p1_vec(<2 x i8> %val, <2 x i8> %bits) { ret <2 x i1> %r } -define <3 x i1> @p2_vec_undef0(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef0( +define <3 x i1> @p2_vec_poison0(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison0( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; - %t0 = shl <3 x i8> , %bits + %t0 = shl <3 x i8> , %bits %t1 = xor <3 x i8> %t0, %r = icmp ult <3 x i8> %t1, %val ret <3 x i1> %r } -define <3 x i1> @p2_vec_undef1(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef1( +define <3 x i1> @p2_vec_poison1(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison1( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; %t0 = shl <3 x i8> , %bits - %t1 = xor <3 x i8> %t0, + %t1 = xor <3 x i8> %t0, %r = icmp ult <3 x i8> %t1, %val ret <3 x i1> %r } -define <3 x i1> @p2_vec_undef2(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef2( +define <3 x i1> @p2_vec_poison2(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison2( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; - %t0 = shl <3 x i8> , %bits - %t1 = xor <3 x i8> %t0, + %t0 = shl <3 x i8> , %bits + %t1 = xor <3 x i8> %t0, %r = icmp ult <3 x i8> %t1, %val ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index 10ab1fe118348c..31093c7ca1036c 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -1790,14 +1790,14 @@ define <2 x i1> @icmp_add20_eq_add57_splat(<2 x i32> %x, <2 x i32> %y) { ret <2 x i1> %cmp } -define <2 x i1> @icmp_add20_eq_add57_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @icmp_add20_eq_add57_undef( +define <2 x i1> @icmp_add20_eq_add57_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @icmp_add20_eq_add57_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[Y:%.*]], ; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %1 = add <2 x i32> %x, - %2 = add <2 x i32> %y, + %2 = add <2 x i32> %y, %cmp = icmp eq <2 x i32> %1, %2 ret <2 x i1> %cmp } @@ -1838,14 +1838,14 @@ define <2 x i1> @icmp_sub57_ne_sub20_splat(<2 x i32> %x, <2 x i32> %y) { ret <2 x i1> %cmp } -define <2 x i1> @icmp_sub57_ne_sub20_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @icmp_sub57_ne_sub20_vec_undef( +define <2 x i1> @icmp_sub57_ne_sub20_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @icmp_sub57_ne_sub20_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x 
i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; - %1 = add <2 x i32> %x, - %2 = add <2 x i32> %y, + %1 = add <2 x i32> %x, + %2 = add <2 x i32> %y, %cmp = icmp ne <2 x i32> %1, %2 ret <2 x i1> %cmp } @@ -1926,14 +1926,14 @@ define <2 x i1> @icmp_add20_sge_add57_splat(<2 x i32> %x, <2 x i32> %y) { ret <2 x i1> %cmp } -define <2 x i1> @icmp_add20_sge_add57_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @icmp_add20_sge_add57_undef( +define <2 x i1> @icmp_add20_sge_add57_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @icmp_add20_sge_add57_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[Y:%.*]], ; CHECK-NEXT: [[CMP:%.*]] = icmp sle <2 x i32> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %1 = add nsw <2 x i32> %x, - %2 = add nsw <2 x i32> %y, + %2 = add nsw <2 x i32> %y, %cmp = icmp sge <2 x i32> %1, %2 ret <2 x i1> %cmp } @@ -1975,14 +1975,14 @@ define <2 x i1> @icmp_sub57_sge_sub20_splat(<2 x i32> %x, <2 x i32> %y) { ret <2 x i1> %cmp } -define <2 x i1> @icmp_sub57_sge_sub20_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @icmp_sub57_sge_sub20_vec_undef( +define <2 x i1> @icmp_sub57_sge_sub20_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @icmp_sub57_sge_sub20_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[CMP:%.*]] = icmp sge <2 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; - %1 = add nsw <2 x i32> %x, - %2 = add nsw <2 x i32> %y, + %1 = add nsw <2 x i32> %x, + %2 = add nsw <2 x i32> %y, %cmp = icmp sge <2 x i32> %1, %2 ret <2 x i1> %cmp } @@ -2557,13 +2557,13 @@ define <2 x i1> @or_icmp_eq_B_0_icmp_ult_A_B_uniform(<2 x i64> %a, <2 x i64> %b) ret <2 x i1> %3 } -define <2 x i1> @or_icmp_eq_B_0_icmp_ult_A_B_undef(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: @or_icmp_eq_B_0_icmp_ult_A_B_undef( +define <2 x i1> @or_icmp_eq_B_0_icmp_ult_A_B_poison(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: @or_icmp_eq_B_0_icmp_ult_A_B_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[B:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <2 x i64> [[TMP1]], [[A:%.*]] ; CHECK-NEXT: ret <2 x i1> [[TMP2]] ; - %1 = icmp eq <2 x i64> %b, + %1 = icmp eq <2 x i64> %b, %2 = icmp ult <2 x i64> %a, %b %3 = or <2 x i1> %1, %2 ret <2 x i1> %3 @@ -2606,14 +2606,14 @@ define <2 x i1> @or_icmp_ne_A_0_icmp_ne_B_0_uniform(<2 x i64> %a, <2 x i64> %b) ret <2 x i1> %3 } -define <2 x i1> @or_icmp_ne_A_0_icmp_ne_B_0_undef(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: @or_icmp_ne_A_0_icmp_ne_B_0_undef( +define <2 x i1> @or_icmp_ne_A_0_icmp_ne_B_0_poison(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: @or_icmp_ne_A_0_icmp_ne_B_0_poison( ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[TMP2]] ; - %1 = icmp ne <2 x i64> %a, - %2 = icmp ne <2 x i64> %b, + %1 = icmp ne <2 x i64> %a, + %2 = icmp ne <2 x i64> %b, %3 = or <2 x i1> %1, %2 ret <2 x i1> %3 } @@ -2803,13 +2803,13 @@ define <2 x i1> @and_icmp_ne_B_0_icmp_uge_A_B_uniform(<2 x i64> %a, <2 x i64> %b ret <2 x i1> %3 } -define <2 x i1> @and_icmp_ne_B_0_icmp_uge_A_B_undef(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: @and_icmp_ne_B_0_icmp_uge_A_B_undef( +define <2 x i1> @and_icmp_ne_B_0_icmp_uge_A_B_poison(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: @and_icmp_ne_B_0_icmp_uge_A_B_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[B:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <2 x i64> [[TMP1]], [[A:%.*]] ; CHECK-NEXT: ret <2 x i1> [[TMP2]] ; - %1 = icmp ne 
<2 x i64> %b, + %1 = icmp ne <2 x i64> %b, %2 = icmp uge <2 x i64> %a, %b %3 = and <2 x i1> %1, %2 ret <2 x i1> %3 @@ -3272,13 +3272,13 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec_nonuniform(<2 x i32> %x) { ret <2 x i1> %ret } -define <2 x i1> @icmp_and_or_lshr_cst_vec_undef(<2 x i32> %x) { -; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_undef( +define <2 x i1> @icmp_and_or_lshr_cst_vec_poison(<2 x i32> %x) { +; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; - %shf = lshr <2 x i32> %x, + %shf = lshr <2 x i32> %x, %or = or <2 x i32> %shf, %x %and = and <2 x i32> %or, %ret = icmp ne <2 x i32> %and, zeroinitializer @@ -3315,15 +3315,15 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec_nonuniform_commute(<2 x i32> %xp) { ret <2 x i1> %ret } -define <2 x i1> @icmp_and_or_lshr_cst_vec_undef_commute(<2 x i32> %xp) { -; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_undef_commute( +define <2 x i1> @icmp_and_or_lshr_cst_vec_poison_commute(<2 x i32> %xp) { +; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_poison_commute( ; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X]], ; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %x = srem <2 x i32> %xp, ; prevent complexity-based canonicalization - %shf = lshr <2 x i32> %x, + %shf = lshr <2 x i32> %x, %or = or <2 x i32> %x, %shf %and = and <2 x i32> %or, %ret = icmp ne <2 x i32> %and, zeroinitializer @@ -4360,7 +4360,7 @@ define <2 x i1> @signbit_false_logic(<2 x i5> %x) { ; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i5> [[X:%.*]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %dec = add <2 x i5> %x, + %dec = add <2 x i5> %x, %not = xor <2 x i5> %x, %and = and <2 x i5> %dec, %not %r = icmp sgt <2 x i5> %and, diff --git a/llvm/test/Transforms/InstCombine/integer-round-up-pow2-alignment.ll b/llvm/test/Transforms/InstCombine/integer-round-up-pow2-alignment.ll index 7cef922eaf0ce2..c7e0553992b909 100644 --- a/llvm/test/Transforms/InstCombine/integer-round-up-pow2-alignment.ll +++ b/llvm/test/Transforms/InstCombine/integer-round-up-pow2-alignment.ll @@ -86,9 +86,9 @@ define <2 x i8> @t4_splat(<2 x i8> %x) { ret <2 x i8> %x.roundedup } -; Splat-with-undef -define <2 x i8> @t5_splat_undef_0b0001(<2 x i8> %x) { -; CHECK-LABEL: @t5_splat_undef_0b0001( +; Splat-with-poison +define <2 x i8> @t5_splat_poison_0b0001(<2 x i8> %x) { +; CHECK-LABEL: @t5_splat_poison_0b0001( ; CHECK-NEXT: [[X_BIASED1:%.*]] = add <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = and <2 x i8> [[X_BIASED1]], ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] @@ -96,43 +96,43 @@ define <2 x i8> @t5_splat_undef_0b0001(<2 x i8> %x) { %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, - %x.biased.highbits = and <2 x i8> %x.biased, + %x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t5_splat_undef_0b0010(<2 x i8> %x) { -; CHECK-LABEL: @t5_splat_undef_0b0010( +define <2 x i8> @t5_splat_poison_0b0010(<2 x i8> %x) { +; CHECK-LABEL: @t5_splat_poison_0b0010( ; CHECK-NEXT: [[X_BIASED1:%.*]] = add <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = and <2 x i8> [[X_BIASED1]], ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = 
icmp eq <2 x i8> %x.lowbits, - %x.biased = add <2 x i8> %x, + %x.biased = add <2 x i8> %x, %x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t5_splat_undef_0b0100(<2 x i8> %x) { -; CHECK-LABEL: @t5_splat_undef_0b0100( +define <2 x i8> @t5_splat_poison_0b0100(<2 x i8> %x) { +; CHECK-LABEL: @t5_splat_poison_0b0100( ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; %x.lowbits = and <2 x i8> %x, - %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, + %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, %x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t5_splat_undef_0b1000(<2 x i8> %x) { -; CHECK-LABEL: @t5_splat_undef_0b1000( +define <2 x i8> @t5_splat_poison_0b1000(<2 x i8> %x) { +; CHECK-LABEL: @t5_splat_poison_0b1000( ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; - %x.lowbits = and <2 x i8> %x, + %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, %x.biased.highbits = and <2 x i8> %x.biased, @@ -177,64 +177,64 @@ define <2 x i8> @t7_nonsplat_bias(<2 x i8> %x) { } ; Splat-in-disguise vector tests -define <2 x i8> @t8_nonsplat_masked_by_undef_0b0001(<2 x i8> %x) { -; CHECK-LABEL: @t8_nonsplat_masked_by_undef_0b0001( +define <2 x i8> @t8_nonsplat_masked_by_poison_0b0001(<2 x i8> %x) { +; CHECK-LABEL: @t8_nonsplat_masked_by_poison_0b0001( ; CHECK-NEXT: [[X_LOWBITS:%.*]] = and <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_LOWBITS_ARE_ZERO:%.*]] = icmp eq <2 x i8> [[X_LOWBITS]], zeroinitializer ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X]], -; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i8> [[X_BIASED]], +; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = select <2 x i1> [[X_LOWBITS_ARE_ZERO]], <2 x i8> [[X]], <2 x i8> [[X_BIASED_HIGHBITS]] ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, - %x.biased.highbits = and <2 x i8> %x.biased, + %x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t8_nonsplat_masked_by_undef_0b0010(<2 x i8> %x) { -; CHECK-LABEL: @t8_nonsplat_masked_by_undef_0b0010( +define <2 x i8> @t8_nonsplat_masked_by_poison_0b0010(<2 x i8> %x) { +; CHECK-LABEL: @t8_nonsplat_masked_by_poison_0b0010( ; CHECK-NEXT: [[X_LOWBITS:%.*]] = and <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_LOWBITS_ARE_ZERO:%.*]] = icmp eq <2 x i8> [[X_LOWBITS]], zeroinitializer -; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X]], +; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X]], ; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = select <2 x i1> [[X_LOWBITS_ARE_ZERO]], <2 x i8> [[X]], <2 x i8> [[X_BIASED_HIGHBITS]] ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, - %x.biased = add <2 x i8> %x, + %x.biased = add <2 x i8> 
%x, %x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t8_nonsplat_masked_by_undef_0b0100(<2 x i8> %x) { -; CHECK-LABEL: @t8_nonsplat_masked_by_undef_0b0100( +define <2 x i8> @t8_nonsplat_masked_by_poison_0b0100(<2 x i8> %x) { +; CHECK-LABEL: @t8_nonsplat_masked_by_poison_0b0100( ; CHECK-NEXT: [[X_LOWBITS:%.*]] = and <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[X_LOWBITS_ARE_ZERO:%.*]] = icmp eq <2 x i8> [[X_LOWBITS]], +; CHECK-NEXT: [[X_LOWBITS_ARE_ZERO:%.*]] = icmp eq <2 x i8> [[X_LOWBITS]], ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X]], ; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = select <2 x i1> [[X_LOWBITS_ARE_ZERO]], <2 x i8> [[X]], <2 x i8> [[X_BIASED_HIGHBITS]] ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; %x.lowbits = and <2 x i8> %x, - %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, + %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, %x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t8_nonsplat_masked_by_undef_0b1000(<2 x i8> %x) { -; CHECK-LABEL: @t8_nonsplat_masked_by_undef_0b1000( -; CHECK-NEXT: [[X_LOWBITS:%.*]] = and <2 x i8> [[X:%.*]], +define <2 x i8> @t8_nonsplat_masked_by_poison_0b1000(<2 x i8> %x) { +; CHECK-LABEL: @t8_nonsplat_masked_by_poison_0b1000( +; CHECK-NEXT: [[X_LOWBITS:%.*]] = and <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_LOWBITS_ARE_ZERO:%.*]] = icmp eq <2 x i8> [[X_LOWBITS]], zeroinitializer ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X]], ; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = select <2 x i1> [[X_LOWBITS_ARE_ZERO]], <2 x i8> [[X]], <2 x i8> [[X_BIASED_HIGHBITS]] ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; - %x.lowbits = and <2 x i8> %x, + %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, %x.biased.highbits = and <2 x i8> %x.biased, @@ -442,28 +442,28 @@ define i8 @t17_oneuse(i8 %x) { define <2 x i4> @t18_replacement_0b0001(<2 x i4> %x) { ; CHECK-LABEL: @t18_replacement_0b0001( ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i4> [[X:%.*]], -; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i4> [[X_BIASED]], +; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i4> [[X_BIASED]], ; CHECK-NEXT: call void @use.v2i4(<2 x i4> [[X_BIASED_HIGHBITS]]) ; CHECK-NEXT: ret <2 x i4> [[X_BIASED_HIGHBITS]] ; %x.lowbits = and <2 x i4> %x, %x.lowbits.are.zero = icmp eq <2 x i4> %x.lowbits, %x.biased = add <2 x i4> %x, - %x.biased.highbits = and <2 x i4> %x.biased, + %x.biased.highbits = and <2 x i4> %x.biased, call void @use.v2i4(<2 x i4> %x.biased.highbits) %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i4> %x, <2 x i4> %x.biased.highbits ret <2 x i4> %x.roundedup } define <2 x i4> @t18_replacement_0b0010(<2 x i4> %x) { ; CHECK-LABEL: @t18_replacement_0b0010( -; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i4> [[X:%.*]], +; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i4> [[X:%.*]], ; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i4> [[X_BIASED]], ; CHECK-NEXT: call void @use.v2i4(<2 x i4> [[X_BIASED_HIGHBITS]]) ; CHECK-NEXT: ret <2 x i4> [[X_BIASED_HIGHBITS]] ; %x.lowbits = and <2 x i4> %x, %x.lowbits.are.zero = icmp eq <2 x i4> %x.lowbits, - %x.biased = add <2 x i4> %x, + %x.biased = add <2 x i4> %x, 
%x.biased.highbits = and <2 x i4> %x.biased, call void @use.v2i4(<2 x i4> %x.biased.highbits) %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i4> %x, <2 x i4> %x.biased.highbits @@ -477,7 +477,7 @@ define <2 x i4> @t18_replacement_0b0100(<2 x i4> %x) { ; CHECK-NEXT: ret <2 x i4> [[X_BIASED_HIGHBITS]] ; %x.lowbits = and <2 x i4> %x, - %x.lowbits.are.zero = icmp eq <2 x i4> %x.lowbits, + %x.lowbits.are.zero = icmp eq <2 x i4> %x.lowbits, %x.biased = add <2 x i4> %x, %x.biased.highbits = and <2 x i4> %x.biased, call void @use.v2i4(<2 x i4> %x.biased.highbits) @@ -491,7 +491,7 @@ define <2 x i4> @t18_replacement_0b1000(<2 x i4> %x) { ; CHECK-NEXT: call void @use.v2i4(<2 x i4> [[X_BIASED_HIGHBITS]]) ; CHECK-NEXT: ret <2 x i4> [[X_BIASED_HIGHBITS]] ; - %x.lowbits = and <2 x i4> %x, + %x.lowbits = and <2 x i4> %x, %x.lowbits.are.zero = icmp eq <2 x i4> %x.lowbits, %x.biased = add <2 x i4> %x, %x.biased.highbits = and <2 x i4> %x.biased, diff --git a/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll b/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll index 486113202ddd75..a76662c4bc4395 100644 --- a/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll +++ b/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll @@ -20,14 +20,14 @@ define <2 x i4> @vector (<2 x i4> %x, <2 x i4> %y, <2 x i4> %m) { ret <2 x i4> %r } -define <3 x i4> @vector_undef (<3 x i4> %x, <3 x i4> %y, <3 x i4> %m) { -; CHECK-LABEL: @vector_undef( +define <3 x i4> @vector_poison (<3 x i4> %x, <3 x i4> %y, <3 x i4> %m) { +; CHECK-LABEL: @vector_poison( ; CHECK-NEXT: [[N0:%.*]] = xor <3 x i4> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i4> [[N0]], [[M:%.*]] ; CHECK-NEXT: [[R:%.*]] = xor <3 x i4> [[TMP1]], [[X]] ; CHECK-NEXT: ret <3 x i4> [[R]] ; - %im = xor <3 x i4> %m, + %im = xor <3 x i4> %m, %n0 = xor <3 x i4> %x, %y %n1 = and <3 x i4> %n0, %im %r = xor <3 x i4> %n1, %y @@ -78,17 +78,17 @@ define <2 x i4> @in_constant_varx_6_invmask_nonsplat(<2 x i4> %x, <2 x i4> %mask ret <2 x i4> %r } -define <3 x i4> @in_constant_varx_6_invmask_undef(<3 x i4> %x, <3 x i4> %mask) { -; CHECK-LABEL: @in_constant_varx_6_invmask_undef( -; CHECK-NEXT: [[N0:%.*]] = xor <3 x i4> [[X:%.*]], +define <3 x i4> @in_constant_varx_6_invmask_poison(<3 x i4> %x, <3 x i4> %mask) { +; CHECK-LABEL: @in_constant_varx_6_invmask_poison( +; CHECK-NEXT: [[N0:%.*]] = xor <3 x i4> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i4> [[N0]], [[MASK:%.*]] ; CHECK-NEXT: [[R:%.*]] = xor <3 x i4> [[TMP1]], [[X]] ; CHECK-NEXT: ret <3 x i4> [[R]] ; - %notmask = xor <3 x i4> %mask, - %n0 = xor <3 x i4> %x, ; %x + %notmask = xor <3 x i4> %mask, + %n0 = xor <3 x i4> %x, ; %x %n1 = and <3 x i4> %n0, %notmask - %r = xor <3 x i4> %n1, + %r = xor <3 x i4> %n1, ret <3 x i4> %r } @@ -133,15 +133,15 @@ define <2 x i4> @in_constant_6_vary_invmask_nonsplat(<2 x i4> %y, <2 x i4> %mask ret <2 x i4> %r } -define <3 x i4> @in_constant_6_vary_invmask_undef(<3 x i4> %y, <3 x i4> %mask) { -; CHECK-LABEL: @in_constant_6_vary_invmask_undef( -; CHECK-NEXT: [[N0:%.*]] = xor <3 x i4> [[Y:%.*]], +define <3 x i4> @in_constant_6_vary_invmask_poison(<3 x i4> %y, <3 x i4> %mask) { +; CHECK-LABEL: @in_constant_6_vary_invmask_poison( +; CHECK-NEXT: [[N0:%.*]] = xor <3 x i4> [[Y:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i4> [[N0]], [[MASK:%.*]] -; CHECK-NEXT: [[R:%.*]] = xor <3 x i4> [[TMP1]], +; CHECK-NEXT: [[R:%.*]] = xor <3 x i4> [[TMP1]], ; CHECK-NEXT: ret <3 x i4> 
[[R]] ; - %notmask = xor <3 x i4> %mask, - %n0 = xor <3 x i4> %y, ; %x + %notmask = xor <3 x i4> %mask, + %n0 = xor <3 x i4> %y, ; %x %n1 = and <3 x i4> %n0, %notmask %r = xor <3 x i4> %n1, %y ret <3 x i4> %r diff --git a/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll b/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll index 847a7940bad8c7..5d058b20be7207 100644 --- a/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll +++ b/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll @@ -81,39 +81,39 @@ define <4 x i1> @vec_4xi32_lshr_and_negC_eq(<4 x i32> %x, <4 x i32> %y) { ret <4 x i1> %r } -define <4 x i1> @vec_lshr_and_negC_eq_undef1(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_lshr_and_negC_eq_undef1( +define <4 x i1> @vec_lshr_and_negC_eq_poison1(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_lshr_and_negC_eq_poison1( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ult <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y - %and = and <4 x i32> %lshr, ; ~7 + %and = and <4 x i32> %lshr, ; ~7 %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_lshr_and_negC_eq_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_lshr_and_negC_eq_undef2( +define <4 x i1> @vec_lshr_and_negC_eq_poison2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_lshr_and_negC_eq_poison2( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ult <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y %and = and <4 x i32> %lshr, ; ~7 - %r = icmp eq <4 x i32> %and, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_lshr_and_negC_eq_undef3(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_lshr_and_negC_eq_undef3( +define <4 x i1> @vec_lshr_and_negC_eq_poison3(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_lshr_and_negC_eq_poison3( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ult <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y - %and = and <4 x i32> %lshr, ; ~7 - %r = icmp eq <4 x i32> %and, + %and = and <4 x i32> %lshr, ; ~7 + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/lshr-and-signbit-icmpeq-zero.ll b/llvm/test/Transforms/InstCombine/lshr-and-signbit-icmpeq-zero.ll index 39f4e58b25dc84..0166680309ea8c 100644 --- a/llvm/test/Transforms/InstCombine/lshr-and-signbit-icmpeq-zero.ll +++ b/llvm/test/Transforms/InstCombine/lshr-and-signbit-icmpeq-zero.ll @@ -81,39 +81,39 @@ define <4 x i1> @vec_4xi32_lshr_and_signbit_eq(<4 x i32> %x, <4 x i32> %y) { ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_undef1(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_undef1( +define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_poison1(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_poison1( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y - %and = and <4 x i32> %lshr, + %and = and <4 x i32> %lshr, %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_undef2( +define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_poison2(<4 x i32> %x, <4 x i32> 
%y) { +; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_poison2( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y %and = and <4 x i32> %lshr, - %r = icmp eq <4 x i32> %and, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_undef3(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_undef3( +define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_poison3(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_poison3( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y - %and = and <4 x i32> %lshr, - %r = icmp eq <4 x i32> %and, + %and = and <4 x i32> %lshr, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/masked-merge-add.ll b/llvm/test/Transforms/InstCombine/masked-merge-add.ll index f655153108a436..0484369e99d6a5 100644 --- a/llvm/test/Transforms/InstCombine/masked-merge-add.ll +++ b/llvm/test/Transforms/InstCombine/masked-merge-add.ll @@ -51,7 +51,7 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], ; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: [[RET:%.*]] = add <3 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <3 x i32> [[RET]] ; %and = and <3 x i32> %x, %m @@ -61,6 +61,21 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ret <3 x i32> %ret } +define <3 x i32> @p_vec_poison(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) { +; CHECK-LABEL: @p_vec_poison( +; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] +; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], +; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: ret <3 x i32> [[RET]] +; + %and = and <3 x i32> %x, %m + %neg = xor <3 x i32> %m, + %and1 = and <3 x i32> %neg, %y + %ret = add <3 x i32> %and, %and1 + ret <3 x i32> %ret +} + ; ============================================================================ ; ; Constant mask. 
; ============================================================================ ; diff --git a/llvm/test/Transforms/InstCombine/masked-merge-or.ll b/llvm/test/Transforms/InstCombine/masked-merge-or.ll index b49ec07706e284..0531a532fc7e0a 100644 --- a/llvm/test/Transforms/InstCombine/masked-merge-or.ll +++ b/llvm/test/Transforms/InstCombine/masked-merge-or.ll @@ -51,7 +51,7 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], ; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: [[RET:%.*]] = or <3 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <3 x i32> [[RET]] ; %and = and <3 x i32> %x, %m @@ -61,6 +61,21 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ret <3 x i32> %ret } +define <3 x i32> @p_vec_poison(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) { +; CHECK-LABEL: @p_vec_poison( +; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] +; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], +; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: ret <3 x i32> [[RET]] +; + %and = and <3 x i32> %x, %m + %neg = xor <3 x i32> %m, + %and1 = and <3 x i32> %neg, %y + %ret = or <3 x i32> %and, %and1 + ret <3 x i32> %ret +} + ; ============================================================================ ; ; Constant mask. ; ============================================================================ ; diff --git a/llvm/test/Transforms/InstCombine/masked-merge-xor.ll b/llvm/test/Transforms/InstCombine/masked-merge-xor.ll index a6d201be68cee5..74cc7625aebff5 100644 --- a/llvm/test/Transforms/InstCombine/masked-merge-xor.ll +++ b/llvm/test/Transforms/InstCombine/masked-merge-xor.ll @@ -51,7 +51,7 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], ; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: [[RET:%.*]] = xor <3 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <3 x i32> [[RET]] ; %and = and <3 x i32> %x, %m @@ -61,6 +61,21 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ret <3 x i32> %ret } +define <3 x i32> @p_vec_poison(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) { +; CHECK-LABEL: @p_vec_poison( +; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] +; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], +; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: ret <3 x i32> [[RET]] +; + %and = and <3 x i32> %x, %m + %neg = xor <3 x i32> %m, + %and1 = and <3 x i32> %neg, %y + %ret = xor <3 x i32> %and, %and1 + ret <3 x i32> %ret +} + ; ============================================================================ ; ; Constant mask. 
; ============================================================================ ; diff --git a/llvm/test/Transforms/InstCombine/min-positive.ll b/llvm/test/Transforms/InstCombine/min-positive.ll index 1fb212b7387254..d2c2e9018792bd 100644 --- a/llvm/test/Transforms/InstCombine/min-positive.ll +++ b/llvm/test/Transforms/InstCombine/min-positive.ll @@ -67,16 +67,16 @@ define <2 x i1> @smin_commute_vec(<2 x i32> %x, <2 x i32> %other) { ret <2 x i1> %test } -define <2 x i1> @smin_commute_vec_undef_elts(<2 x i32> %x, <2 x i32> %other) { -; CHECK-LABEL: @smin_commute_vec_undef_elts( -; CHECK-NEXT: [[TEST:%.*]] = icmp sgt <2 x i32> [[OTHER:%.*]], +define <2 x i1> @smin_commute_vec_poison_elts(<2 x i32> %x, <2 x i32> %other) { +; CHECK-LABEL: @smin_commute_vec_poison_elts( +; CHECK-NEXT: [[TEST:%.*]] = icmp sgt <2 x i32> [[OTHER:%.*]], ; CHECK-NEXT: ret <2 x i1> [[TEST]] ; %notneg = and <2 x i32> %x, %positive = or <2 x i32> %notneg, %cmp = icmp slt <2 x i32> %other, %positive %sel = select <2 x i1> %cmp, <2 x i32> %other, <2 x i32> %positive - %test = icmp sgt <2 x i32> %sel, + %test = icmp sgt <2 x i32> %sel, ret <2 x i1> %test } ; %positive might be zero diff --git a/llvm/test/Transforms/InstCombine/minmax-fold.ll b/llvm/test/Transforms/InstCombine/minmax-fold.ll index bbbbf9eb6eafe4..8b47dc7a28079e 100644 --- a/llvm/test/Transforms/InstCombine/minmax-fold.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fold.ll @@ -1360,14 +1360,15 @@ define i8 @PR14613_smax(i8 %x) { define i8 @PR46271(<2 x i8> %x) { ; CHECK-LABEL: @PR46271( -; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[X:%.*]], <2 x i8> ) +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[A_INV:%.*]] = icmp slt <2 x i8> [[X]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[A_INV]], <2 x i8> , <2 x i8> [[TMP3]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i64 1 -; CHECK-NEXT: [[R:%.*]] = xor i8 [[TMP2]], -1 -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %a = icmp sgt <2 x i8> %x, - %b = select <2 x i1> %a, <2 x i8> %x, <2 x i8> - %not = xor <2 x i8> %b, + %b = select <2 x i1> %a, <2 x i8> %x, <2 x i8> + %not = xor <2 x i8> %b, %r = extractelement <2 x i8> %not, i32 1 ret i8 %r } diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll index bd1a47bbfcc193..a76f0f84ba3401 100644 --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -393,7 +393,7 @@ define i8 @smax_of_nots(i8 %x, i8 %y) { ret i8 %m } -; Vectors are ok (including undef lanes of not ops) +; Vectors are ok (including poison lanes of not ops) define <3 x i8> @smin_of_nots(<3 x i8> %x, <3 x i8> %y) { ; CHECK-LABEL: @smin_of_nots( @@ -401,8 +401,8 @@ define <3 x i8> @smin_of_nots(<3 x i8> %x, <3 x i8> %y) { ; CHECK-NEXT: [[M:%.*]] = xor <3 x i8> [[TMP1]], ; CHECK-NEXT: ret <3 x i8> [[M]] ; - %notx = xor <3 x i8> %x, - %noty = xor <3 x i8> %y, + %notx = xor <3 x i8> %x, + %noty = xor <3 x i8> %y, %m = call <3 x i8> @llvm.smin.v3i8(<3 x i8> %notx, <3 x i8> %noty) ret <3 x i8> %m } @@ -473,16 +473,16 @@ define i8 @smax_of_not_and_const(i8 %x) { ret i8 %m } -; Vectors are ok (including undef lanes of not ops and min/max constant operand) +; Vectors are ok (including poison lanes of not ops and min/max constant operand) define <3 x i8> @smin_of_not_and_const(<3 x i8> %x) { ; CHECK-LABEL: @smin_of_not_and_const( -; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i8> 
@llvm.smax.v3i8(<3 x i8> [[X:%.*]], <3 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i8> @llvm.smax.v3i8(<3 x i8> [[X:%.*]], <3 x i8> ) ; CHECK-NEXT: [[M:%.*]] = xor <3 x i8> [[TMP1]], ; CHECK-NEXT: ret <3 x i8> [[M]] ; - %notx = xor <3 x i8> %x, - %m = call <3 x i8> @llvm.smin.v3i8(<3 x i8> , <3 x i8> %notx) + %notx = xor <3 x i8> %x, + %m = call <3 x i8> @llvm.smin.v3i8(<3 x i8> , <3 x i8> %notx) ret <3 x i8> %m } @@ -706,7 +706,7 @@ define <3 x i8> @smax_negation_vec(<3 x i8> %x) { ; CHECK-NEXT: [[R:%.*]] = call <3 x i8> @llvm.abs.v3i8(<3 x i8> [[X:%.*]], i1 false) ; CHECK-NEXT: ret <3 x i8> [[R]] ; - %s = sub <3 x i8> , %x + %s = sub <3 x i8> , %x %r = call <3 x i8> @llvm.smax.v3i8(<3 x i8> %x, <3 x i8> %s) ret <3 x i8> %r } @@ -912,7 +912,7 @@ define <3 x i8> @umin_non_zero_idiom4(<3 x i8> %a) { ; CHECK-NEXT: [[RES:%.*]] = zext <3 x i1> [[TMP1]] to <3 x i8> ; CHECK-NEXT: ret <3 x i8> [[RES]] ; - %res = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %a, <3 x i8> ) + %res = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %a, <3 x i8> ) ret <3 x i8> %res } @@ -2118,15 +2118,15 @@ define i8 @umin_offset_uses(i8 %x) { ret i8 %m } -; TODO: This could transform, but undef element must not propagate to the new add. +; TODO: This could transform -define <3 x i8> @umax_vector_splat_undef(<3 x i8> %x) { -; CHECK-LABEL: @umax_vector_splat_undef( -; CHECK-NEXT: [[A:%.*]] = add nuw <3 x i8> [[X:%.*]], +define <3 x i8> @umax_vector_splat_poison(<3 x i8> %x) { +; CHECK-LABEL: @umax_vector_splat_poison( +; CHECK-NEXT: [[A:%.*]] = add nuw <3 x i8> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = call <3 x i8> @llvm.umax.v3i8(<3 x i8> [[A]], <3 x i8> ) ; CHECK-NEXT: ret <3 x i8> [[R]] ; - %a = add nuw <3 x i8> %x, + %a = add nuw <3 x i8> %x, %r = call <3 x i8> @llvm.umax.v3i8(<3 x i8> %a, <3 x i8> ) ret <3 x i8> %r } @@ -2506,8 +2506,8 @@ entry: ret i8 %val } -define <3 x i8> @fold_umax_with_knownbits_info_undef_in_splat(<3 x i8> %a, <3 x i8> %b) { -; CHECK-LABEL: @fold_umax_with_knownbits_info_undef_in_splat( +define <3 x i8> @fold_umax_with_knownbits_info_poison_in_splat(<3 x i8> %a, <3 x i8> %b) { +; CHECK-LABEL: @fold_umax_with_knownbits_info_poison_in_splat( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A1:%.*]] = or <3 x i8> [[A:%.*]], ; CHECK-NEXT: [[A2:%.*]] = shl <3 x i8> [[B:%.*]], @@ -2518,7 +2518,7 @@ entry: %a1 = or <3 x i8> %a, %a2 = shl <3 x i8> %b, %sub = sub <3 x i8> %a1, %a2 - %val = call <3 x i8> @llvm.umax.v3i8(<3 x i8> %sub, <3 x i8> ) + %val = call <3 x i8> @llvm.umax.v3i8(<3 x i8> %sub, <3 x i8> ) ret <3 x i8> %val } @@ -2535,8 +2535,8 @@ entry: ret i8 %val } -define <3 x i8> @fold_umin_with_knownbits_info_undef_in_splat(<3 x i8> %a, <3 x i8> %b) { -; CHECK-LABEL: @fold_umin_with_knownbits_info_undef_in_splat( +define <3 x i8> @fold_umin_with_knownbits_info_poison_in_splat(<3 x i8> %a, <3 x i8> %b) { +; CHECK-LABEL: @fold_umin_with_knownbits_info_poison_in_splat( ; CHECK-NEXT: entry: ; CHECK-NEXT: ret <3 x i8> ; @@ -2544,7 +2544,7 @@ entry: %a1 = or <3 x i8> %a, %a2 = shl <3 x i8> %b, %sub = sub <3 x i8> %a1, %a2 - %val = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %sub, <3 x i8> ) + %val = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %sub, <3 x i8> ) ret <3 x i8> %val } diff --git a/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll b/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll index 8fe4261bbf009c..f47c5577075cbd 100644 --- a/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll @@ -784,7 +784,7 @@ define <2 x i8> @negate_if_false_commute(<2 x i8> %px, <2 x 
i1> %cond) { ; CHECK-NEXT: ret <2 x i8> [[R]] ; %x = sdiv <2 x i8> , %px ; thwart complexity-based canonicalization - %sel = select <2 x i1> %cond, <2 x i8> , <2 x i8> + %sel = select <2 x i1> %cond, <2 x i8> , <2 x i8> %r = mul <2 x i8> %x, %sel ret <2 x i8> %r } @@ -931,7 +931,7 @@ define @mul_scalable_splat_zero( %z) { ; CHECK-LABEL: @mul_scalable_splat_zero( ; CHECK-NEXT: ret zeroinitializer ; - %shuf = shufflevector insertelement ( undef, i64 0, i32 0), poison, zeroinitializer + %shuf = shufflevector insertelement ( poison, i64 0, i32 0), poison, zeroinitializer %t3 = mul %shuf, %z ret %t3 } @@ -973,14 +973,14 @@ define <2 x i32> @mulsub1_vec_nonuniform(<2 x i32> %a0, <2 x i32> %a1) { ret <2 x i32> %mul } -define <2 x i32> @mulsub1_vec_nonuniform_undef(<2 x i32> %a0, <2 x i32> %a1) { -; CHECK-LABEL: @mulsub1_vec_nonuniform_undef( +define <2 x i32> @mulsub1_vec_nonuniform_poison(<2 x i32> %a0, <2 x i32> %a1) { +; CHECK-LABEL: @mulsub1_vec_nonuniform_poison( ; CHECK-NEXT: [[SUB_NEG:%.*]] = sub <2 x i32> [[A0:%.*]], [[A1:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %sub = sub <2 x i32> %a1, %a0 - %mul = mul <2 x i32> %sub, + %mul = mul <2 x i32> %sub, ret <2 x i32> %mul } @@ -1017,14 +1017,14 @@ define <2 x i32> @mulsub2_vec_nonuniform(<2 x i32> %a0) { ret <2 x i32> %mul } -define <2 x i32> @mulsub2_vec_nonuniform_undef(<2 x i32> %a0) { -; CHECK-LABEL: @mulsub2_vec_nonuniform_undef( +define <2 x i32> @mulsub2_vec_nonuniform_poison(<2 x i32> %a0) { +; CHECK-LABEL: @mulsub2_vec_nonuniform_poison( ; CHECK-NEXT: [[SUB_NEG:%.*]] = add <2 x i32> [[A0:%.*]], ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %sub = sub <2 x i32> , %a0 - %mul = mul <2 x i32> %sub, + %mul = mul <2 x i32> %sub, ret <2 x i32> %mul } @@ -1061,14 +1061,14 @@ define <2 x i32> @muladd2_vec_nonuniform(<2 x i32> %a0) { ret <2 x i32> %mul } -define <2 x i32> @muladd2_vec_nonuniform_undef(<2 x i32> %a0) { -; CHECK-LABEL: @muladd2_vec_nonuniform_undef( +define <2 x i32> @muladd2_vec_nonuniform_poison(<2 x i32> %a0) { +; CHECK-LABEL: @muladd2_vec_nonuniform_poison( ; CHECK-NEXT: [[ADD_NEG:%.*]] = sub <2 x i32> , [[A0:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i32> [[ADD_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %add = add <2 x i32> %a0, - %mul = mul <2 x i32> %add, + %mul = mul <2 x i32> %add, ret <2 x i32> %mul } diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll index d4a689c60786e0..227ca4a6d5cfad 100644 --- a/llvm/test/Transforms/InstCombine/mul.ll +++ b/llvm/test/Transforms/InstCombine/mul.ll @@ -1496,7 +1496,7 @@ define <2 x i8> @negate_if_false_commute(<2 x i8> %px, <2 x i1> %cond) { ; CHECK-NEXT: ret <2 x i8> [[R]] ; %x = sdiv <2 x i8> , %px ; thwart complexity-based canonicalization - %sel = select <2 x i1> %cond, <2 x i8> , <2 x i8> + %sel = select <2 x i1> %cond, <2 x i8> , <2 x i8> %r = mul <2 x i8> %x, %sel ret <2 x i8> %r } @@ -1643,7 +1643,7 @@ define @mul_scalable_splat_zero( %z) { ; CHECK-LABEL: @mul_scalable_splat_zero( ; CHECK-NEXT: ret zeroinitializer ; - %shuf = shufflevector insertelement ( undef, i64 0, i32 0), undef, zeroinitializer + %shuf = shufflevector insertelement ( poison, i64 0, i32 0), poison, zeroinitializer %t3 = mul %shuf, %z ret %t3 } @@ -1752,14 +1752,14 @@ define <2 x i32> @mulsub1_vec_nonuniform(<2 x i32> %a0, <2 x i32> %a1) { ret <2 x i32> %mul } -define <2 x i32> @mulsub1_vec_nonuniform_undef(<2 x i32> %a0, <2 x i32> %a1) { -; CHECK-LABEL: 
@mulsub1_vec_nonuniform_undef( +define <2 x i32> @mulsub1_vec_nonuniform_poison(<2 x i32> %a0, <2 x i32> %a1) { +; CHECK-LABEL: @mulsub1_vec_nonuniform_poison( ; CHECK-NEXT: [[SUB_NEG:%.*]] = sub <2 x i32> [[A0:%.*]], [[A1:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %sub = sub <2 x i32> %a1, %a0 - %mul = mul <2 x i32> %sub, + %mul = mul <2 x i32> %sub, ret <2 x i32> %mul } @@ -1796,14 +1796,14 @@ define <2 x i32> @mulsub2_vec_nonuniform(<2 x i32> %a0) { ret <2 x i32> %mul } -define <2 x i32> @mulsub2_vec_nonuniform_undef(<2 x i32> %a0) { -; CHECK-LABEL: @mulsub2_vec_nonuniform_undef( +define <2 x i32> @mulsub2_vec_nonuniform_poison(<2 x i32> %a0) { +; CHECK-LABEL: @mulsub2_vec_nonuniform_poison( ; CHECK-NEXT: [[SUB_NEG:%.*]] = add <2 x i32> [[A0:%.*]], ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %sub = sub <2 x i32> , %a0 - %mul = mul <2 x i32> %sub, + %mul = mul <2 x i32> %sub, ret <2 x i32> %mul } @@ -1819,15 +1819,15 @@ define i8 @mulsub_nsw(i8 %a1, i8 %a2) { } ; It would be safe to keep the nsw on the shl here, but only because the mul -; to shl transform happens to replace undef with 0. -define <2 x i8> @mulsub_nsw_undef(<2 x i8> %a1, <2 x i8> %a2) { -; CHECK-LABEL: @mulsub_nsw_undef( +; to shl transform happens to replace poison with 0. +define <2 x i8> @mulsub_nsw_poison(<2 x i8> %a1, <2 x i8> %a2) { +; CHECK-LABEL: @mulsub_nsw_poison( ; CHECK-NEXT: [[A_NEG:%.*]] = sub nsw <2 x i8> [[A2:%.*]], [[A1:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i8> [[A_NEG]], ; CHECK-NEXT: ret <2 x i8> [[MUL]] ; %a = sub nsw <2 x i8> %a1, %a2 - %mul = mul nsw <2 x i8> %a, + %mul = mul nsw <2 x i8> %a, ret <2 x i8> %mul } @@ -1864,14 +1864,14 @@ define <2 x i32> @muladd2_vec_nonuniform(<2 x i32> %a0) { ret <2 x i32> %mul } -define <2 x i32> @muladd2_vec_nonuniform_undef(<2 x i32> %a0) { -; CHECK-LABEL: @muladd2_vec_nonuniform_undef( +define <2 x i32> @muladd2_vec_nonuniform_poison(<2 x i32> %a0) { +; CHECK-LABEL: @muladd2_vec_nonuniform_poison( ; CHECK-NEXT: [[ADD_NEG:%.*]] = sub <2 x i32> , [[A0:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i32> [[ADD_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %add = add <2 x i32> %a0, - %mul = mul <2 x i32> %add, + %mul = mul <2 x i32> %add, ret <2 x i32> %mul } diff --git a/llvm/test/Transforms/InstCombine/not-add.ll b/llvm/test/Transforms/InstCombine/not-add.ll index 877f558ffd5037..9ba37b6bba39ea 100644 --- a/llvm/test/Transforms/InstCombine/not-add.ll +++ b/llvm/test/Transforms/InstCombine/not-add.ll @@ -115,26 +115,26 @@ define <4 x i32> @vector_test(<4 x i32> %x, <4 x i32> %y) { ret <4 x i32> %nota } -define <4 x i32> @vector_test_undef(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vector_test_undef( +define <4 x i32> @vector_test_poison(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vector_test_poison( ; CHECK-NEXT: [[NOTA:%.*]] = sub <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <4 x i32> [[NOTA]] ; - %notx = xor <4 x i32> %x, + %notx = xor <4 x i32> %x, %a = add <4 x i32> %notx, %y - %nota = xor <4 x i32> %a, + %nota = xor <4 x i32> %a, ret <4 x i32> %nota } -define <4 x i32> @vector_test_undef_nsw_nuw(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vector_test_undef_nsw_nuw( +define <4 x i32> @vector_test_poison_nsw_nuw(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vector_test_poison_nsw_nuw( ; CHECK-NEXT: [[NOTA:%.*]] = sub nuw nsw <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <4 x i32> [[NOTA]] ; - %notx = xor <4 x i32> %x, + %notx = xor <4 x i32> 
%x, %a = add nsw nuw <4 x i32> %notx, %y - %nota = xor <4 x i32> %a, + %nota = xor <4 x i32> %a, ret <4 x i32> %nota } diff --git a/llvm/test/Transforms/InstCombine/not.ll b/llvm/test/Transforms/InstCombine/not.ll index 98b5d980415602..0c2c6195e32407 100644 --- a/llvm/test/Transforms/InstCombine/not.ll +++ b/llvm/test/Transforms/InstCombine/not.ll @@ -430,9 +430,9 @@ define <3 x i5> @not_or_neg_commute_vec(<3 x i5> %x, <3 x i5> %p) { ; CHECK-NEXT: ret <3 x i5> [[NOT]] ; %y = mul <3 x i5> %p, ; thwart complexity-based-canonicalization - %s = sub <3 x i5> , %x + %s = sub <3 x i5> , %x %o = or <3 x i5> %y, %s - %not = xor <3 x i5> %o, + %not = xor <3 x i5> %o, ret <3 x i5> %not } diff --git a/llvm/test/Transforms/InstCombine/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/Transforms/InstCombine/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll index c16633efe4ce3c..3fd4a17d972af4 100644 --- a/llvm/test/Transforms/InstCombine/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ b/llvm/test/Transforms/InstCombine/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -95,41 +95,41 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) { ret <4 x i1> %t2 } -define <4 x i1> @p6_vector_urem_by_const__nonsplat_undef0(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @p6_vector_urem_by_const__nonsplat_undef0( -; CHECK-NEXT: [[T0:%.*]] = and <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[T1:%.*]] = urem <4 x i32> [[T0]], -; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T1]], zeroinitializer +; The poison value in the vector makes the whole function UB. + +define <4 x i1> @p6_vector_urem_by_const__nonsplat_poison0(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @p6_vector_urem_by_const__nonsplat_poison0( +; CHECK-NEXT: [[T0:%.*]] = and <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T0]], zeroinitializer ; CHECK-NEXT: ret <4 x i1> [[T2]] ; - %t0 = and <4 x i32> %x, + %t0 = and <4 x i32> %x, %t1 = urem <4 x i32> %t0, ; '6' is clearly not a power of two %t2 = icmp eq <4 x i32> %t1, ret <4 x i1> %t2 } -define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @p7_vector_urem_by_const__nonsplat_undef2( +define <4 x i1> @p7_vector_urem_by_const__nonsplat_poison2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @p7_vector_urem_by_const__nonsplat_poison2( ; CHECK-NEXT: [[T0:%.*]] = and <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T0]], +; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T0]], ; CHECK-NEXT: ret <4 x i1> [[T2]] ; %t0 = and <4 x i32> %x, ; clearly a power-of-two or zero %t1 = urem <4 x i32> %t0, ; '6' is clearly not a power of two - %t2 = icmp eq <4 x i32> %t1, + %t2 = icmp eq <4 x i32> %t1, ret <4 x i1> %t2 } -define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @p8_vector_urem_by_const__nonsplat_undef3( -; CHECK-NEXT: [[T0:%.*]] = and <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[T1:%.*]] = urem <4 x i32> [[T0]], -; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T1]], +define <4 x i1> @p8_vector_urem_by_const__nonsplat_poison3(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @p8_vector_urem_by_const__nonsplat_poison3( +; CHECK-NEXT: [[T0:%.*]] = and <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T0]], ; CHECK-NEXT: ret <4 x i1> [[T2]] ; - %t0 = and <4 x i32> %x, + %t0 = and <4 x i32> %x, %t1 = urem <4 x i32> %t0, ; '6' is clearly not a power of two - %t2 = icmp eq <4 x i32> %t1, + 
%t2 = icmp eq <4 x i32> %t1, ret <4 x i1> %t2 } diff --git a/llvm/test/Transforms/InstCombine/operand-complexity.ll b/llvm/test/Transforms/InstCombine/operand-complexity.ll index 62cfc76d9d24e9..541a15275b6170 100644 --- a/llvm/test/Transforms/InstCombine/operand-complexity.ll +++ b/llvm/test/Transforms/InstCombine/operand-complexity.ll @@ -29,15 +29,15 @@ define <2 x i8> @neg_vec(<2 x i8> %x) { ret <2 x i8> %r } -define <2 x i8> @neg_vec_undef(<2 x i8> %x) { -; CHECK-LABEL: @neg_vec_undef( +define <2 x i8> @neg_vec_poison(<2 x i8> %x) { +; CHECK-LABEL: @neg_vec_poison( ; CHECK-NEXT: [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[NEGX:%.*]] = sub <2 x i8> , [[X]] +; CHECK-NEXT: [[NEGX:%.*]] = sub <2 x i8> , [[X]] ; CHECK-NEXT: [[R:%.*]] = xor <2 x i8> [[BO]], [[NEGX]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %bo = udiv <2 x i8> %x, - %negx = sub <2 x i8> , %x + %negx = sub <2 x i8> , %x %r = xor <2 x i8> %negx, %bo ret <2 x i8> %r } @@ -70,15 +70,15 @@ define <2 x i8> @not_vec(<2 x i8> %x) { ret <2 x i8> %r } -define <2 x i8> @not_vec_undef(<2 x i8> %x) { -; CHECK-LABEL: @not_vec_undef( +define <2 x i8> @not_vec_poison(<2 x i8> %x) { +; CHECK-LABEL: @not_vec_poison( ; CHECK-NEXT: [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[NOTX:%.*]] = xor <2 x i8> [[X]], +; CHECK-NEXT: [[NOTX:%.*]] = xor <2 x i8> [[X]], ; CHECK-NEXT: [[R:%.*]] = mul <2 x i8> [[BO]], [[NOTX]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %bo = udiv <2 x i8> %x, - %notx = xor <2 x i8> , %x + %notx = xor <2 x i8> , %x %r = mul <2 x i8> %notx, %bo ret <2 x i8> %r } @@ -134,8 +134,8 @@ define <2 x float> @fneg_vec(<2 x float> %x) { ret <2 x float> %r } -define <2 x float> @fneg_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fneg_vec_undef( +define <2 x float> @fneg_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fneg_vec_poison( ; CHECK-NEXT: [[BO:%.*]] = fdiv <2 x float> [[X:%.*]], ; CHECK-NEXT: [[FNEGX:%.*]] = fneg <2 x float> [[X]] ; CHECK-NEXT: [[R:%.*]] = fmul <2 x float> [[BO]], [[FNEGX]] @@ -143,7 +143,7 @@ define <2 x float> @fneg_vec_undef(<2 x float> %x) { ; CHECK-NEXT: ret <2 x float> [[R]] ; %bo = fdiv <2 x float> %x, - %fnegx = fsub <2 x float> , %x + %fnegx = fsub <2 x float> , %x %r = fmul <2 x float> %fnegx, %bo call void @use_vec(<2 x float> %fnegx) ret <2 x float> %r diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll index 1b1a6ffbf0f2d3..6e2085a8bb6c7d 100644 --- a/llvm/test/Transforms/InstCombine/or.ll +++ b/llvm/test/Transforms/InstCombine/or.ll @@ -262,26 +262,26 @@ define <2 x i1> @and_icmp_eq_0_vector(<2 x i32> %A, <2 x i32> %B) { ret <2 x i1> %D } -define <2 x i1> @and_icmp_eq_0_vector_undef1(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @and_icmp_eq_0_vector_undef1( +define <2 x i1> @and_icmp_eq_0_vector_poison1(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @and_icmp_eq_0_vector_poison1( ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[D:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[D]] ; - %C1 = icmp eq <2 x i32> %A, - %C2 = icmp eq <2 x i32> %B, + %C1 = icmp eq <2 x i32> %A, + %C2 = icmp eq <2 x i32> %B, %D = and <2 x i1> %C1, %C2 ret <2 x i1> %D } -define <2 x i1> @and_icmp_eq_0_vector_undef2(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @and_icmp_eq_0_vector_undef2( +define <2 x i1> @and_icmp_eq_0_vector_poison2(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @and_icmp_eq_0_vector_poison2( ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[D:%.*]] = icmp eq <2 x i32> 
[[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[D]] ; - %C1 = icmp eq <2 x i32> %A, - %C2 = icmp eq <2 x i32> %B, + %C1 = icmp eq <2 x i32> %A, + %C2 = icmp eq <2 x i32> %B, %D = and <2 x i1> %C1, %C2 ret <2 x i1> %D } @@ -566,17 +566,17 @@ define <2 x i1> @test37_uniform(<2 x i32> %x) { ret <2 x i1> %ret1 } -define <2 x i1> @test37_undef(<2 x i32> %x) { -; CHECK-LABEL: @test37_undef( -; CHECK-NEXT: [[ADD1:%.*]] = add <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <2 x i32> [[ADD1]], -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq <2 x i32> [[X]], +define <2 x i1> @test37_poison(<2 x i32> %x) { +; CHECK-LABEL: @test37_poison( +; CHECK-NEXT: [[ADD1:%.*]] = add <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <2 x i32> [[ADD1]], +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq <2 x i32> [[X]], ; CHECK-NEXT: [[RET1:%.*]] = or <2 x i1> [[CMP1]], [[CMP2]] ; CHECK-NEXT: ret <2 x i1> [[RET1]] ; - %add1 = add <2 x i32> %x, - %cmp1 = icmp ult <2 x i32> %add1, - %cmp2 = icmp eq <2 x i32> %x, + %add1 = add <2 x i32> %x, + %cmp1 = icmp ult <2 x i32> %add1, + %cmp2 = icmp eq <2 x i32> %x, %ret1 = or <2 x i1> %cmp1, %cmp2 ret <2 x i1> %ret1 } @@ -874,19 +874,19 @@ define <2 x i1> @test46_uniform(<2 x i8> %c) { ret <2 x i1> %or } -define <2 x i1> @test46_undef(<2 x i8> %c) { -; CHECK-LABEL: @test46_undef( -; CHECK-NEXT: [[C_OFF:%.*]] = add <2 x i8> [[C:%.*]], -; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <2 x i8> [[C_OFF]], -; CHECK-NEXT: [[C_OFF17:%.*]] = add <2 x i8> [[C]], -; CHECK-NEXT: [[CMP2:%.*]] = icmp ult <2 x i8> [[C_OFF17]], +define <2 x i1> @test46_poison(<2 x i8> %c) { +; CHECK-LABEL: @test46_poison( +; CHECK-NEXT: [[C_OFF:%.*]] = add <2 x i8> [[C:%.*]], +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <2 x i8> [[C_OFF]], +; CHECK-NEXT: [[C_OFF17:%.*]] = add <2 x i8> [[C]], +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult <2 x i8> [[C_OFF17]], ; CHECK-NEXT: [[OR:%.*]] = or <2 x i1> [[CMP1]], [[CMP2]] ; CHECK-NEXT: ret <2 x i1> [[OR]] ; - %c.off = add <2 x i8> %c, - %cmp1 = icmp ult <2 x i8> %c.off, - %c.off17 = add <2 x i8> %c, - %cmp2 = icmp ult <2 x i8> %c.off17, + %c.off = add <2 x i8> %c, + %cmp1 = icmp ult <2 x i8> %c.off, + %c.off17 = add <2 x i8> %c, + %cmp2 = icmp ult <2 x i8> %c.off17, %or = or <2 x i1> %cmp1, %cmp2 ret <2 x i1> %or } diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll index f0c2f129e3df3e..5ed7d641df65be 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll @@ -89,13 +89,13 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t7 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( -; CHECK-NEXT: [[T0:%.*]] = add <8 x i32> [[NBITS:%.*]], +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( +; CHECK-NEXT: [[T0:%.*]] = add <8 x i32> [[NBITS:%.*]], ; CHECK-NEXT: [[T1:%.*]] = zext <8 x i32> [[T0]] to <8 x i64> -; CHECK-NEXT: [[T2:%.*]] = shl <8 x i64> , [[T1]] -; CHECK-NEXT: [[T3:%.*]] = xor <8 x i64> [[T2]], -; CHECK-NEXT: [[T4:%.*]] = sub <8 x i32> , [[NBITS]] +; CHECK-NEXT: [[T2:%.*]] = shl nsw <8 x i64> , [[T1]] +; CHECK-NEXT: [[T3:%.*]] = xor <8 x i64> [[T2]], +; CHECK-NEXT: [[T4:%.*]] = 
sub <8 x i32> , [[NBITS]] ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T2]]) @@ -106,11 +106,11 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: [[T7:%.*]] = and <8 x i32> [[TMP2]], ; CHECK-NEXT: ret <8 x i32> [[T7]] ; - %t0 = add <8 x i32> %nbits, + %t0 = add <8 x i32> %nbits, %t1 = zext <8 x i32> %t0 to <8 x i64> - %t2 = shl <8 x i64> , %t1 ; shifting by nbits-1 - %t3 = xor <8 x i64> %t2, - %t4 = sub <8 x i32> , %nbits + %t2 = shl <8 x i64> , %t1 ; shifting by nbits-1 + %t3 = xor <8 x i64> %t2, + %t4 = sub <8 x i32> , %nbits call void @use8xi32(<8 x i32> %t0) call void @use8xi64(<8 x i64> %t1) diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll index 46d1de5781b71c..1a711e58c333be 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll @@ -73,11 +73,11 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t5 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T2]]) @@ -87,8 +87,8 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T5]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = lshr <8 x i64> , %t0 - %t2 = add <8 x i32> %nbits, + %t1 = lshr <8 x i64> , %t0 + %t2 = add <8 x i32> %nbits, call void @use8xi64(<8 x i64> %t0) call void @use8xi64(<8 x i64> %t1) @@ -103,8 +103,8 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-LABEL: @t3_vec_nonsplat( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T2]]) @@ -114,8 +114,8 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T5]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = lshr <8 x i64> , %t0 - %t2 = add <8 x i32> %nbits, + %t1 = lshr <8 x i64> , %t0 + %t2 = add <8 x i32> %nbits, call void @use8xi64(<8 x i64> %t0) call void @use8xi64(<8 x i64> %t1) diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll 
b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll index 48873852cfc7cd..cd0098ecdb0a6a 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll @@ -81,12 +81,12 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t6 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> , [[T0]] +; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> , [[T0]] ; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T2]]) @@ -97,9 +97,9 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T6]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = shl <8 x i64> , %t0 + %t1 = shl <8 x i64> , %t0 %t2 = lshr <8 x i64> %t1, %t0 - %t3 = add <8 x i32> %nbits, + %t3 = add <8 x i32> %nbits, call void @use8xi64(<8 x i64> %t0) call void @use8xi64(<8 x i64> %t1) @@ -115,9 +115,9 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-LABEL: @t3_vec_nonsplat( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> , [[T0]] +; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> , [[T0]] ; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T2]]) @@ -128,9 +128,9 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T6]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = shl <8 x i64> , %t0 + %t1 = shl <8 x i64> , %t0 %t2 = lshr <8 x i64> %t1, %t0 - %t3 = add <8 x i32> %nbits, + %t3 = add <8 x i32> %nbits, call void @use8xi64(<8 x i64> %t0) call void @use8xi64(<8 x i64> %t1) diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll index 8b3f01bcb76913..1debf111b18cd7 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll @@ -71,12 +71,12 @@ define <8 x i32> @t1_vec_splat(<8 x i32> %x, <8 x i32> %nbits) { ret <8 x i32> %t5 } -define <8 x i32> @t1_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t1_vec_splat_undef( -; CHECK-NEXT: [[T0:%.*]] = add <8 x i32> [[NBITS:%.*]], -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i32> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = xor <8 x i32> [[T1]], -; CHECK-NEXT: [[T4:%.*]] = sub <8 x i32> , [[NBITS]] +define <8 x i32> @t1_vec_splat_poison(<8 x i32> %x, <8 x 
i32> %nbits) { +; CHECK-LABEL: @t1_vec_splat_poison( +; CHECK-NEXT: [[T0:%.*]] = add <8 x i32> [[NBITS:%.*]], +; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i32> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = xor <8 x i32> [[T1]], +; CHECK-NEXT: [[T4:%.*]] = sub <8 x i32> , [[NBITS]] ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T0]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T1]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T2]]) @@ -85,11 +85,11 @@ define <8 x i32> @t1_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) { ; CHECK-NEXT: [[T5:%.*]] = and <8 x i32> [[TMP1]], ; CHECK-NEXT: ret <8 x i32> [[T5]] ; - %t0 = add <8 x i32> %nbits, - %t1 = shl <8 x i32> , %t0 - %t2 = xor <8 x i32> %t1, + %t0 = add <8 x i32> %nbits, + %t1 = shl <8 x i32> , %t0 + %t2 = xor <8 x i32> %t1, %t3 = and <8 x i32> %t2, %x - %t4 = sub <8 x i32> , %nbits + %t4 = sub <8 x i32> , %nbits call void @use8xi32(<8 x i32> %t0) call void @use8xi32(<8 x i32> %t1) call void @use8xi32(<8 x i32> %t2) diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll index 58a905063fac46..55d0b3f80a519b 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll @@ -55,19 +55,19 @@ define <8 x i32> @t1_vec_splat(<8 x i32> %x, <8 x i32> %nbits) { ret <8 x i32> %t3 } -define <8 x i32> @t1_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t1_vec_splat_undef( -; CHECK-NEXT: [[T0:%.*]] = lshr <8 x i32> , [[NBITS:%.*]] -; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], +define <8 x i32> @t1_vec_splat_poison(<8 x i32> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t1_vec_splat_poison( +; CHECK-NEXT: [[T0:%.*]] = lshr <8 x i32> , [[NBITS:%.*]] +; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T0]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T2]]) ; CHECK-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[X:%.*]], [[T2]] ; CHECK-NEXT: [[T3:%.*]] = and <8 x i32> [[TMP1]], ; CHECK-NEXT: ret <8 x i32> [[T3]] ; - %t0 = lshr <8 x i32> , %nbits + %t0 = lshr <8 x i32> , %nbits %t1 = and <8 x i32> %t0, %x - %t2 = add <8 x i32> %nbits, + %t2 = add <8 x i32> %nbits, call void @use8xi32(<8 x i32> %t0) call void @use8xi32(<8 x i32> %t2) %t3 = shl <8 x i32> %t1, %t2 ; shift is smaller than mask diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll index 9c096d1418a5b4..7ad99a6bb0a38f 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll @@ -63,11 +63,11 @@ define <8 x i32> @t2_vec_splat(<8 x i32> %x, <8 x i32> %nbits) { ret <8 x i32> %t4 } -define <8 x i32> @t2_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( -; CHECK-NEXT: [[T0:%.*]] = shl <8 x i32> , [[NBITS:%.*]] +define <8 x i32> @t2_vec_splat_poison(<8 x i32> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( +; CHECK-NEXT: [[T0:%.*]] = shl nsw <8 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i32> , [[NBITS]] -; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi32(<8 
x i32> [[T0]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T1]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T3]]) @@ -75,10 +75,10 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) { ; CHECK-NEXT: [[T4:%.*]] = and <8 x i32> [[TMP1]], ; CHECK-NEXT: ret <8 x i32> [[T4]] ; - %t0 = shl <8 x i32> , %nbits + %t0 = shl <8 x i32> , %nbits %t1 = lshr <8 x i32> %t0, %nbits %t2 = and <8 x i32> %t1, %x - %t3 = add <8 x i32> %nbits, + %t3 = add <8 x i32> %nbits, call void @use8xi32(<8 x i32> %t0) call void @use8xi32(<8 x i32> %t1) call void @use8xi32(<8 x i32> %t3) diff --git a/llvm/test/Transforms/InstCombine/pr53357.ll b/llvm/test/Transforms/InstCombine/pr53357.ll index 0a6d2993ce46a8..0ae690869c1c44 100644 --- a/llvm/test/Transforms/InstCombine/pr53357.ll +++ b/llvm/test/Transforms/InstCombine/pr53357.ll @@ -30,16 +30,16 @@ define <2 x i32> @src_vec(<2 x i32> noundef %0, <2 x i32> noundef %1) { ret <2 x i32> %6 } -; vector version of src with undef values -define <2 x i32> @src_vec_undef(<2 x i32> noundef %0, <2 x i32> noundef %1) { -; CHECK-LABEL: @src_vec_undef( +; vector version of src with poison values +define <2 x i32> @src_vec_poison(<2 x i32> noundef %0, <2 x i32> noundef %1) { +; CHECK-LABEL: @src_vec_poison( ; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[TMP1:%.*]], [[TMP0:%.*]] ; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i32> [[TMP3]], ; CHECK-NEXT: ret <2 x i32> [[TMP4]] ; %3 = and <2 x i32> %1, %0 %4 = or <2 x i32> %1, %0 - %5 = xor <2 x i32> %4, + %5 = xor <2 x i32> %4, %6 = add <2 x i32> %3, %5 ret <2 x i32> %6 } diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll index d49cfe990d82d9..cb6775e689b8cb 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll @@ -89,12 +89,12 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t6 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = xor <8 x i64> [[T1]], -; CHECK-NEXT: [[T3:%.*]] = sub <8 x i32> , [[NBITS]] +; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = xor <8 x i64> [[T1]], +; CHECK-NEXT: [[T3:%.*]] = sub <8 x i32> , [[NBITS]] ; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[T2]], [[X:%.*]] ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[NBITS]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) @@ -107,9 +107,9 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T6]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = shl <8 x i64> , %t0 - %t2 = xor <8 x i64> %t1, - %t3 = sub <8 x i32> , %nbits + %t1 = shl <8 x i64> , %t0 + %t2 = xor <8 x i64> %t1, + %t3 = sub <8 x i32> , %nbits %t4 = and <8 x i64> %t2, %x call void @use8xi32(<8 x i32> %nbits) diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-c.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-c.ll index fbbeffbba630b2..a78246781c7f9d 
100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-c.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-c.ll @@ -77,11 +77,11 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t5 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: [[T3:%.*]] = and <8 x i64> [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) @@ -92,8 +92,8 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T5]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = lshr <8 x i64> , %t0 - %t2 = add <8 x i32> %nbits, + %t1 = lshr <8 x i64> , %t0 + %t2 = add <8 x i32> %nbits, %t3 = and <8 x i64> %t1, %x call void @use8xi64(<8 x i64> %t0) @@ -109,8 +109,8 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-LABEL: @t3_vec_nonsplat( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: [[T3:%.*]] = and <8 x i64> [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) @@ -121,8 +121,8 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T5]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = lshr <8 x i64> , %t0 - %t2 = add <8 x i32> %nbits, + %t1 = lshr <8 x i64> , %t0 + %t2 = add <8 x i32> %nbits, %t3 = and <8 x i64> %t1, %x call void @use8xi64(<8 x i64> %t0) diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll index 1a977f67a6a5a8..b79ab790975270 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll @@ -85,12 +85,12 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t6 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> , [[T0]] +; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> , [[T0]] ; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[T2]], [[X:%.*]] ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; 
CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) @@ -102,9 +102,9 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T6]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = shl <8 x i64> , %t0 + %t1 = shl <8 x i64> , %t0 %t2 = lshr <8 x i64> %t1, %t0 - %t3 = add <8 x i32> %nbits, + %t3 = add <8 x i32> %nbits, %t4 = and <8 x i64> %t2, %x call void @use8xi64(<8 x i64> %t0) @@ -121,9 +121,9 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-LABEL: @t3_vec_nonsplat( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> , [[T0]] +; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> , [[T0]] ; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[T2]], [[X:%.*]] ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) @@ -135,9 +135,9 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T6]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = shl <8 x i64> , %t0 + %t1 = shl <8 x i64> , %t0 %t2 = lshr <8 x i64> %t1, %t0 - %t3 = add <8 x i32> %nbits, + %t3 = add <8 x i32> %nbits, %t4 = and <8 x i64> %t2, %x call void @use8xi64(<8 x i64> %t0) diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll index ddaef5f4b47c81..4b955a894fcfe6 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll @@ -155,12 +155,12 @@ define <3 x i32> @t4_vec_nonsplat(<3 x i32> %x, <3 x i32> %nbits) { ret <3 x i32> %t5 } -define <3 x i32> @t5_vec_undef(<3 x i32> %x, <3 x i32> %nbits) { -; CHECK-LABEL: @t5_vec_undef( -; CHECK-NEXT: [[T1:%.*]] = shl <3 x i32> , [[NBITS:%.*]] -; CHECK-NEXT: [[T2:%.*]] = xor <3 x i32> [[T1]], +define <3 x i32> @t5_vec_poison(<3 x i32> %x, <3 x i32> %nbits) { +; CHECK-LABEL: @t5_vec_poison( +; CHECK-NEXT: [[T1:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]] +; CHECK-NEXT: [[T2:%.*]] = xor <3 x i32> [[T1]], ; CHECK-NEXT: [[T3:%.*]] = and <3 x i32> [[T2]], [[X:%.*]] -; CHECK-NEXT: [[T4:%.*]] = sub <3 x i32> , [[NBITS]] +; CHECK-NEXT: [[T4:%.*]] = sub <3 x i32> , [[NBITS]] ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[NBITS]]) ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T1]]) ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T2]]) @@ -169,11 +169,11 @@ define <3 x i32> @t5_vec_undef(<3 x i32> %x, <3 x i32> %nbits) { ; CHECK-NEXT: [[T5:%.*]] = shl <3 x i32> [[X]], [[T4]] ; CHECK-NEXT: ret <3 x i32> [[T5]] ; - %t0 = add <3 x i32> %nbits, - %t1 = shl <3 x i32> , %t0 - %t2 = xor <3 x i32> %t1, + %t0 = add <3 x i32> %nbits, + %t1 = shl <3 x i32> , %t0 + %t2 = xor <3 x i32> %t1, %t3 = and <3 x i32> %t2, %x - %t4 = sub <3 x i32> , %nbits + %t4 = sub <3 x i32> , %nbits call void @use3xi32(<3 x i32> %t0) call void @use3xi32(<3 x i32> %t1) call void @use3xi32(<3 x i32> %t2) diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-c.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-c.ll index c7747cfafcff5f..8428ef67d6b86b 100644 --- 
a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-c.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-c.ll @@ -99,20 +99,20 @@ define <3 x i32> @t3_vec_nonsplat(<3 x i32> %x, <3 x i32> %nbits) { ret <3 x i32> %t3 } -define <3 x i32> @t4_vec_undef(<3 x i32> %x, <3 x i32> %nbits) { -; CHECK-LABEL: @t4_vec_undef( -; CHECK-NEXT: [[T0:%.*]] = lshr <3 x i32> , [[NBITS:%.*]] +define <3 x i32> @t4_vec_poison(<3 x i32> %x, <3 x i32> %nbits) { +; CHECK-LABEL: @t4_vec_poison( +; CHECK-NEXT: [[T0:%.*]] = lshr <3 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = and <3 x i32> [[T0]], [[X:%.*]] -; CHECK-NEXT: [[T2:%.*]] = add <3 x i32> [[NBITS]], +; CHECK-NEXT: [[T2:%.*]] = add <3 x i32> [[NBITS]], ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T0]]) ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T1]]) ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T2]]) ; CHECK-NEXT: [[T3:%.*]] = shl <3 x i32> [[X]], [[T2]] ; CHECK-NEXT: ret <3 x i32> [[T3]] ; - %t0 = lshr <3 x i32> , %nbits + %t0 = lshr <3 x i32> , %nbits %t1 = and <3 x i32> %t0, %x - %t2 = add <3 x i32> %nbits, + %t2 = add <3 x i32> %nbits, call void @use3xi32(<3 x i32> %t0) call void @use3xi32(<3 x i32> %t1) call void @use3xi32(<3 x i32> %t2) diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll index 549729fe8b59c5..5d8ff9e9fb71bd 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll @@ -115,9 +115,9 @@ define <3 x i32> @t3_vec_nonsplat(<3 x i32> %x, <3 x i32> %nbits) { ret <3 x i32> %t4 } -define <3 x i32> @t4_vec_undef(<3 x i32> %x, <3 x i32> %nbits) { -; CHECK-LABEL: @t4_vec_undef( -; CHECK-NEXT: [[T0:%.*]] = shl <3 x i32> , [[NBITS:%.*]] +define <3 x i32> @t4_vec_poison(<3 x i32> %x, <3 x i32> %nbits) { +; CHECK-LABEL: @t4_vec_poison( +; CHECK-NEXT: [[T0:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i32> , [[NBITS]] ; CHECK-NEXT: [[T2:%.*]] = and <3 x i32> [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T0]]) @@ -127,10 +127,10 @@ define <3 x i32> @t4_vec_undef(<3 x i32> %x, <3 x i32> %nbits) { ; CHECK-NEXT: [[T4:%.*]] = shl <3 x i32> [[X]], [[NBITS]] ; CHECK-NEXT: ret <3 x i32> [[T4]] ; - %t0 = shl <3 x i32> , %nbits + %t0 = shl <3 x i32> , %nbits %t1 = lshr <3 x i32> %t0, %nbits %t2 = and <3 x i32> %t1, %x - %t3 = add <3 x i32> %nbits, + %t3 = add <3 x i32> %nbits, call void @use3xi32(<3 x i32> %t0) call void @use3xi32(<3 x i32> %t1) call void @use3xi32(<3 x i32> %t2) diff --git a/llvm/test/Transforms/InstCombine/reuse-constant-from-select-in-icmp.ll b/llvm/test/Transforms/InstCombine/reuse-constant-from-select-in-icmp.ll index fd0d942ad840b6..301ead708a08f7 100644 --- a/llvm/test/Transforms/InstCombine/reuse-constant-from-select-in-icmp.ll +++ b/llvm/test/Transforms/InstCombine/reuse-constant-from-select-in-icmp.ll @@ -102,36 +102,36 @@ define <2 x i32> @p7_vec_splat_sgt(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %r } -; Vectors with undef +; Vectors with poison -define <2 x i32> @p8_vec_nonsplat_undef0(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @p8_vec_nonsplat_undef0( +define <2 x i32> @p8_vec_nonsplat_poison0(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @p8_vec_nonsplat_poison0( ; CHECK-NEXT: [[T_INV:%.*]] = icmp ugt <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = select <2 x i1> 
[[T_INV]], <2 x i32> , <2 x i32> [[Y:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %t = icmp ult <2 x i32> %x, + %t = icmp ult <2 x i32> %x, %r = select <2 x i1> %t, <2 x i32> %y, <2 x i32> ret <2 x i32> %r } -define <2 x i32> @p9_vec_nonsplat_undef1(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @p9_vec_nonsplat_undef1( +define <2 x i32> @p9_vec_nonsplat_poison1(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @p9_vec_nonsplat_poison1( ; CHECK-NEXT: [[T_INV:%.*]] = icmp ugt <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[T_INV]], <2 x i32> , <2 x i32> [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[T_INV]], <2 x i32> , <2 x i32> [[Y:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; %t = icmp ult <2 x i32> %x, - %r = select <2 x i1> %t, <2 x i32> %y, <2 x i32> + %r = select <2 x i1> %t, <2 x i32> %y, <2 x i32> ret <2 x i32> %r } -define <2 x i32> @p10_vec_nonsplat_undef2(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @p10_vec_nonsplat_undef2( +define <2 x i32> @p10_vec_nonsplat_poison2(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @p10_vec_nonsplat_poison2( ; CHECK-NEXT: [[T_INV:%.*]] = icmp ugt <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[T_INV]], <2 x i32> , <2 x i32> [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[T_INV]], <2 x i32> , <2 x i32> [[Y:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %t = icmp ult <2 x i32> %x, - %r = select <2 x i1> %t, <2 x i32> %y, <2 x i32> + %t = icmp ult <2 x i32> %x, + %r = select <2 x i1> %t, <2 x i32> %y, <2 x i32> ret <2 x i32> %r } diff --git a/llvm/test/Transforms/InstCombine/rotate.ll b/llvm/test/Transforms/InstCombine/rotate.ll index 6c70c791fd881c..eec623e2f193a9 100644 --- a/llvm/test/Transforms/InstCombine/rotate.ll +++ b/llvm/test/Transforms/InstCombine/rotate.ll @@ -65,24 +65,24 @@ define <2 x i16> @rotl_v2i16_constant_splat(<2 x i16> %x) { ret <2 x i16> %r } -define <2 x i16> @rotl_v2i16_constant_splat_undef0(<2 x i16> %x) { -; CHECK-LABEL: @rotl_v2i16_constant_splat_undef0( +define <2 x i16> @rotl_v2i16_constant_splat_poison0(<2 x i16> %x) { +; CHECK-LABEL: @rotl_v2i16_constant_splat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[X]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; - %shl = shl <2 x i16> %x, + %shl = shl <2 x i16> %x, %shr = lshr <2 x i16> %x, %r = or <2 x i16> %shl, %shr ret <2 x i16> %r } -define <2 x i16> @rotl_v2i16_constant_splat_undef1(<2 x i16> %x) { -; CHECK-LABEL: @rotl_v2i16_constant_splat_undef1( +define <2 x i16> @rotl_v2i16_constant_splat_poison1(<2 x i16> %x) { +; CHECK-LABEL: @rotl_v2i16_constant_splat_poison1( ; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[X]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; %shl = shl <2 x i16> %x, - %shr = lshr <2 x i16> %x, + %shr = lshr <2 x i16> %x, %r = or <2 x i16> %shl, %shr ret <2 x i16> %r } @@ -100,30 +100,30 @@ define <2 x i17> @rotr_v2i17_constant_splat(<2 x i17> %x) { ret <2 x i17> %r } -define <2 x i17> @rotr_v2i17_constant_splat_undef0(<2 x i17> %x) { -; CHECK-LABEL: @rotr_v2i17_constant_splat_undef0( +define <2 x i17> @rotr_v2i17_constant_splat_poison0(<2 x i17> %x) { +; CHECK-LABEL: @rotr_v2i17_constant_splat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[X:%.*]], <2 x i17> [[X]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; - %shl = shl <2 x i17> %x, - %shr = lshr <2 x i17> %x, + %shl = shl <2 x i17> %x, + %shr = lshr <2 x i17> %x, %r = or <2 x i17> %shr, %shl ret <2 x i17> %r } -define <2 
x i17> @rotr_v2i17_constant_splat_undef1(<2 x i17> %x) { -; CHECK-LABEL: @rotr_v2i17_constant_splat_undef1( +define <2 x i17> @rotr_v2i17_constant_splat_poison1(<2 x i17> %x) { +; CHECK-LABEL: @rotr_v2i17_constant_splat_poison1( ; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[X:%.*]], <2 x i17> [[X]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; - %shl = shl <2 x i17> %x, - %shr = lshr <2 x i17> %x, + %shl = shl <2 x i17> %x, + %shr = lshr <2 x i17> %x, %r = or <2 x i17> %shr, %shl ret <2 x i17> %r } ; Allow arbitrary shift constants. -; Support undef elements. +; Support poison elements. define <2 x i32> @rotr_v2i32_constant_nonsplat(<2 x i32> %x) { ; CHECK-LABEL: @rotr_v2i32_constant_nonsplat( @@ -136,17 +136,6 @@ define <2 x i32> @rotr_v2i32_constant_nonsplat(<2 x i32> %x) { ret <2 x i32> %r } -define <2 x i32> @rotr_v2i32_constant_nonsplat_undef0(<2 x i32> %x) { -; CHECK-LABEL: @rotr_v2i32_constant_nonsplat_undef0( -; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[X:%.*]], <2 x i32> [[X]], <2 x i32> ) -; CHECK-NEXT: ret <2 x i32> [[R]] -; - %shl = shl <2 x i32> %x, - %shr = lshr <2 x i32> %x, - %r = or <2 x i32> %shl, %shr - ret <2 x i32> %r -} - define <2 x i32> @rotr_v2i32_constant_nonsplat_poison0(<2 x i32> %x) { ; CHECK-LABEL: @rotr_v2i32_constant_nonsplat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[X:%.*]], <2 x i32> [[X]], <2 x i32> ) @@ -158,13 +147,13 @@ define <2 x i32> @rotr_v2i32_constant_nonsplat_poison0(<2 x i32> %x) { ret <2 x i32> %r } -define <2 x i32> @rotr_v2i32_constant_nonsplat_undef1(<2 x i32> %x) { -; CHECK-LABEL: @rotr_v2i32_constant_nonsplat_undef1( +define <2 x i32> @rotr_v2i32_constant_nonsplat_poison1(<2 x i32> %x) { +; CHECK-LABEL: @rotr_v2i32_constant_nonsplat_poison1( ; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[X:%.*]], <2 x i32> [[X]], <2 x i32> ) ; CHECK-NEXT: ret <2 x i32> [[R]] ; %shl = shl <2 x i32> %x, - %shr = lshr <2 x i32> %x, + %shr = lshr <2 x i32> %x, %r = or <2 x i32> %shl, %shr ret <2 x i32> %r } @@ -180,13 +169,13 @@ define <2 x i36> @rotl_v2i36_constant_nonsplat(<2 x i36> %x) { ret <2 x i36> %r } -define <3 x i36> @rotl_v3i36_constant_nonsplat_undef0(<3 x i36> %x) { -; CHECK-LABEL: @rotl_v3i36_constant_nonsplat_undef0( -; CHECK-NEXT: [[R:%.*]] = call <3 x i36> @llvm.fshl.v3i36(<3 x i36> [[X:%.*]], <3 x i36> [[X]], <3 x i36> ) +define <3 x i36> @rotl_v3i36_constant_nonsplat_poison0(<3 x i36> %x) { +; CHECK-LABEL: @rotl_v3i36_constant_nonsplat_poison0( +; CHECK-NEXT: [[R:%.*]] = call <3 x i36> @llvm.fshl.v3i36(<3 x i36> [[X:%.*]], <3 x i36> [[X]], <3 x i36> ) ; CHECK-NEXT: ret <3 x i36> [[R]] ; - %shl = shl <3 x i36> %x, - %shr = lshr <3 x i36> %x, + %shl = shl <3 x i36> %x, + %shr = lshr <3 x i36> %x, %r = or <3 x i36> %shl, %shr ret <3 x i36> %r } diff --git a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll index c1bb6941d45683..57977a72cd08fd 100644 --- a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll @@ -559,14 +559,14 @@ define <2 x i8> @test_simplify_decrement_vec(<2 x i8> %a) { ret <2 x i8> %i2 } -define <2 x i8> @test_simplify_decrement_vec_undef(<2 x i8> %a) { -; CHECK-LABEL: @test_simplify_decrement_vec_undef( +define <2 x i8> @test_simplify_decrement_vec_poison(<2 x i8> %a) { +; CHECK-LABEL: @test_simplify_decrement_vec_poison( ; CHECK-NEXT: [[I2:%.*]] = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> 
[[A:%.*]], <2 x i8> ) ; CHECK-NEXT: ret <2 x i8> [[I2]] ; %i = icmp eq <2 x i8> %a, %i1 = sub <2 x i8> %a, - %i2 = select <2 x i1> %i, <2 x i8> , <2 x i8> %i1 + %i2 = select <2 x i1> %i, <2 x i8> , <2 x i8> %i1 ret <2 x i8> %i2 } @@ -1818,14 +1818,14 @@ define <4 x i32> @uadd_sat_constant_vec_commute(<4 x i32> %x) { define <4 x i32> @uadd_sat_constant_vec_commute_undefs(<4 x i32> %x) { ; CHECK-LABEL: @uadd_sat_constant_vec_commute_undefs( -; CHECK-NEXT: [[A:%.*]] = add <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[C:%.*]] = icmp ult <4 x i32> [[X]], -; CHECK-NEXT: [[R:%.*]] = select <4 x i1> [[C]], <4 x i32> [[A]], <4 x i32> +; CHECK-NEXT: [[A:%.*]] = add <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[C:%.*]] = icmp ult <4 x i32> [[X]], +; CHECK-NEXT: [[R:%.*]] = select <4 x i1> [[C]], <4 x i32> [[A]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; - %a = add <4 x i32> %x, - %c = icmp ult <4 x i32> %x, - %r = select <4 x i1> %c, <4 x i32> %a, <4 x i32> + %a = add <4 x i32> %x, + %c = icmp ult <4 x i32> %x, + %r = select <4 x i1> %c, <4 x i32> %a, <4 x i32> ret <4 x i32> %r } diff --git a/llvm/test/Transforms/InstCombine/select-of-bittest.ll b/llvm/test/Transforms/InstCombine/select-of-bittest.ll index a6f14cbfbfadf7..e3eb76de459e23 100644 --- a/llvm/test/Transforms/InstCombine/select-of-bittest.ll +++ b/llvm/test/Transforms/InstCombine/select-of-bittest.ll @@ -80,19 +80,18 @@ define <2 x i32> @and_lshr_and_vec_v2(<2 x i32> %arg) { ret <2 x i32> %t4 } -define <3 x i32> @and_lshr_and_vec_undef(<3 x i32> %arg) { -; CHECK-LABEL: @and_lshr_and_vec_undef( +define <3 x i32> @and_lshr_and_vec_poison(<3 x i32> %arg) { +; CHECK-LABEL: @and_lshr_and_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i32> [[ARG:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <3 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[T4:%.*]] = zext <3 x i1> [[TMP2]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[T4]] ; - %t = and <3 x i32> %arg, - %t1 = icmp eq <3 x i32> %t, - %t2 = lshr <3 x i32> %arg, - %t3 = and <3 x i32> %t2, - ; The second element of %t4 is poison because it is (undef ? poison : undef). 
- %t4 = select <3 x i1> %t1, <3 x i32> %t3, <3 x i32>
+ %t = and <3 x i32> %arg,
+ %t1 = icmp eq <3 x i32> %t,
+ %t2 = lshr <3 x i32> %arg,
+ %t3 = and <3 x i32> %t2,
+ %t4 = select <3 x i1> %t1, <3 x i32> %t3, <3 x i32>
 ret <3 x i32> %t4
}
@@ -138,17 +137,17 @@ define <2 x i32> @and_and_vec(<2 x i32> %arg) {
 ret <2 x i32> %t3
}

-define <3 x i32> @and_and_vec_undef(<3 x i32> %arg) {
-; CHECK-LABEL: @and_and_vec_undef(
-; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i32> [[ARG:%.*]],
+define <3 x i32> @and_and_vec_poison(<3 x i32> %arg) {
+; CHECK-LABEL: @and_and_vec_poison(
+; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i32> [[ARG:%.*]],
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <3 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[T3:%.*]] = zext <3 x i1> [[TMP2]] to <3 x i32>
; CHECK-NEXT: ret <3 x i32> [[T3]]
;
- %t = and <3 x i32> %arg,
- %t1 = icmp eq <3 x i32> %t,
- %t2 = and <3 x i32> %arg,
- %t3 = select <3 x i1> %t1, <3 x i32> %t2, <3 x i32>
+ %t = and <3 x i32> %arg,
+ %t1 = icmp eq <3 x i32> %t,
+ %t2 = and <3 x i32> %arg,
+ %t3 = select <3 x i1> %t1, <3 x i32> %t2, <3 x i32>
 ret <3 x i32> %t3
}
@@ -221,8 +220,8 @@ define <2 x i32> @f_var0_vec(<2 x i32> %arg, <2 x i32> %arg1) {
 ret <2 x i32> %t5
}

-define <3 x i32> @f_var0_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) {
-; CHECK-LABEL: @f_var0_vec_undef(
+define <3 x i32> @f_var0_vec_poison(<3 x i32> %arg, <3 x i32> %arg1) {
+; CHECK-LABEL: @f_var0_vec_poison(
; CHECK-NEXT: [[TMP1:%.*]] = or <3 x i32> [[ARG1:%.*]],
; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[ARG:%.*]]
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer
@@ -230,11 +229,11 @@ define <3 x i32> @f_var0_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) {
; CHECK-NEXT: ret <3 x i32> [[T5]]
;
 %t = and <3 x i32> %arg, %arg1
- %t2 = icmp eq <3 x i32> %t,
- %t3 = lshr <3 x i32> %arg,
- %t4 = and <3 x i32> %t3,
- ; The second element of %t5 is poison because it is (undef ? poison : undef).
- %t5 = select <3 x i1> %t2, <3 x i32> %t4, <3 x i32>
+ %t2 = icmp eq <3 x i32> %t,
+ %t3 = lshr <3 x i32> %arg,
+ %t4 = and <3 x i32> %t3,
+ ; The second element of %t5 is poison because it is (poison ? poison : poison).
+ %t5 = select <3 x i1> %t2, <3 x i32> %t4, <3 x i32>
 ret <3 x i32> %t5
}
@@ -284,8 +283,8 @@ define <2 x i32> @f_var1_vec(<2 x i32> %arg, <2 x i32> %arg1) {
 ret <2 x i32> %t4
}

-define <3 x i32> @f_var1_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) {
-; CHECK-LABEL: @f_var1_vec_undef(
+define <3 x i32> @f_var1_vec_poison(<3 x i32> %arg, <3 x i32> %arg1) {
+; CHECK-LABEL: @f_var1_vec_poison(
; CHECK-NEXT: [[TMP1:%.*]] = or <3 x i32> [[ARG1:%.*]],
; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[ARG:%.*]]
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer
@@ -293,9 +292,9 @@ define <3 x i32> @f_var1_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) {
; CHECK-NEXT: ret <3 x i32> [[T4]]
;
 %t = and <3 x i32> %arg, %arg1
- %t2 = icmp eq <3 x i32> %t,
- %t3 = and <3 x i32> %arg,
- %t4 = select <3 x i1> %t2, <3 x i32> %t3, <3 x i32>
+ %t2 = icmp eq <3 x i32> %t,
+ %t3 = and <3 x i32> %arg,
+ %t4 = select <3 x i1> %t2, <3 x i32> %t3, <3 x i32>
 ret <3 x i32> %t4
}
@@ -354,20 +353,20 @@ define <2 x i32> @f_var2_vec(<2 x i32> %arg, <2 x i32> %arg1) {
 ret <2 x i32> %t5
}

-define <3 x i32> @f_var2_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) {
-; CHECK-LABEL: @f_var2_vec_undef(
-; CHECK-NEXT: [[T:%.*]] = and <3 x i32> [[ARG:%.*]],
-; CHECK-NEXT: [[T2:%.*]] = icmp eq <3 x i32> [[T]],
+define <3 x i32> @f_var2_vec_poison(<3 x i32> %arg, <3 x i32> %arg1) {
+; CHECK-LABEL: @f_var2_vec_poison(
+; CHECK-NEXT: [[T:%.*]] = and <3 x i32> [[ARG:%.*]],
+; CHECK-NEXT: [[T2:%.*]] = icmp eq <3 x i32> [[T]],
; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[ARG]], [[ARG1:%.*]]
-; CHECK-NEXT: [[T4:%.*]] = and <3 x i32> [[T3]],
-; CHECK-NEXT: [[T5:%.*]] = select <3 x i1> [[T2]], <3 x i32> [[T4]], <3 x i32>
+; CHECK-NEXT: [[T4:%.*]] = and <3 x i32> [[T3]],
+; CHECK-NEXT: [[T5:%.*]] = select <3 x i1> [[T2]], <3 x i32> [[T4]], <3 x i32>
; CHECK-NEXT: ret <3 x i32> [[T5]]
;
- %t = and <3 x i32> %arg,
- %t2 = icmp eq <3 x i32> %t,
+ %t = and <3 x i32> %arg,
+ %t2 = icmp eq <3 x i32> %t,
 %t3 = lshr <3 x i32> %arg, %arg1
- %t4 = and <3 x i32> %t3,
- %t5 = select <3 x i1> %t2, <3 x i32> %t4, <3 x i32>
+ %t4 = and <3 x i32> %t3,
+ %t5 = select <3 x i1> %t2, <3 x i32> %t4, <3 x i32>
 ret <3 x i32> %t5
}
@@ -427,20 +426,20 @@ define <2 x i32> @f_var3_splatvec(<2 x i32> %arg, <2 x i32> %arg1, <2 x i32> %ar
 ret <2 x i32> %t6
}

-define <3 x i32> @f_var3_vec_undef(<3 x i32> %arg, <3 x i32> %arg1, <3 x i32> %arg2) {
-; CHECK-LABEL: @f_var3_vec_undef(
+define <3 x i32> @f_var3_vec_poison(<3 x i32> %arg, <3 x i32> %arg1, <3 x i32> %arg2) {
+; CHECK-LABEL: @f_var3_vec_poison(
; CHECK-NEXT: [[T:%.*]] = and <3 x i32> [[ARG:%.*]], [[ARG1:%.*]]
-; CHECK-NEXT: [[T3:%.*]] = icmp eq <3 x i32> [[T]],
+; CHECK-NEXT: [[T3:%.*]] = icmp eq <3 x i32> [[T]],
; CHECK-NEXT: [[T4:%.*]] = lshr <3 x i32> [[ARG]], [[ARG2:%.*]]
-; CHECK-NEXT: [[T5:%.*]] = and <3 x i32> [[T4]],
-; CHECK-NEXT: [[T6:%.*]] = select <3 x i1> [[T3]], <3 x i32> [[T5]], <3 x i32>
+; CHECK-NEXT: [[T5:%.*]] = and <3 x i32> [[T4]],
+; CHECK-NEXT: [[T6:%.*]] = select <3 x i1> [[T3]], <3 x i32> [[T5]], <3 x i32>
; CHECK-NEXT: ret <3 x i32> [[T6]]
;
 %t = and <3 x i32> %arg, %arg1
- %t3 = icmp eq <3 x i32> %t,
+ %t3 = icmp eq <3 x i32> %t,
 %t4 = lshr <3 x i32> %arg, %arg2
- %t5 = and <3 x i32> %t4,
- %t6 = select <3 x i1> %t3, <3 x i32> %t5, <3 x i32>
+ %t5 = and <3 x i32> %t4,
+ %t6 = select <3 x i1> %t3, <3 x i32> %t5, <3 x i32>
 ret <3 x i32> %t6
}

diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index
bd8145ab2a35bc..8654691c6f875f 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -3109,45 +3109,46 @@ define <4 x i32> @mul_select_eq_zero_vector(<4 x i32> %x, <4 x i32> %y) { } ; Check that a select is folded into multiplication if condition's operand -; is a vector consisting of zeros and undefs. -; select ( x == {0, undef, ...}), 0, x * y --> freeze(y) * x -define <2 x i32> @mul_select_eq_undef_vector(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @mul_select_eq_undef_vector( -; CHECK-NEXT: [[Y_FR:%.*]] = freeze <2 x i32> [[Y:%.*]] +; is a vector consisting of zeros and poisons. +; select ( x == {0, poison, ...}), 0, x * y --> freeze(y) * x +define <2 x i32> @mul_select_eq_poison_vector(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @mul_select_eq_poison_vector( +; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i32> [[Y_FR:%.*]], ; CHECK-NEXT: [[M:%.*]] = mul <2 x i32> [[Y_FR]], [[X:%.*]] -; CHECK-NEXT: ret <2 x i32> [[M]] +; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[C]], <2 x i32> , <2 x i32> [[M]] +; CHECK-NEXT: ret <2 x i32> [[R]] ; - %c = icmp eq <2 x i32> %x, + %c = icmp eq <2 x i32> %x, %m = mul <2 x i32> %x, %y %r = select <2 x i1> %c, <2 x i32> , <2 x i32> %m ret <2 x i32> %r } ; Check that a select is folded into multiplication if other select's operand -; is a vector consisting of zeros and undefs. -; select ( x == 0), {0, undef, ...}, x * y --> freeze(y) * x -define <2 x i32> @mul_select_eq_zero_sel_undef_vector(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @mul_select_eq_zero_sel_undef_vector( +; is a vector consisting of zeros and poisons. +; select ( x == 0), {0, poison, ...}, x * y --> freeze(y) * x +define <2 x i32> @mul_select_eq_zero_sel_poison_vector(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @mul_select_eq_zero_sel_poison_vector( ; CHECK-NEXT: [[Y_FR:%.*]] = freeze <2 x i32> [[Y:%.*]] ; CHECK-NEXT: [[M:%.*]] = mul <2 x i32> [[Y_FR]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i32> [[M]] ; %c = icmp eq <2 x i32> %x, zeroinitializer %m = mul <2 x i32> %x, %y - %r = select <2 x i1> %c, <2 x i32> , <2 x i32> %m + %r = select <2 x i1> %c, <2 x i32> , <2 x i32> %m ret <2 x i32> %r } ; Negative test: select should not be folded into mul because ; condition's operand and select's operand do not merge into zero vector. -define <2 x i32> @mul_select_eq_undef_vector_not_merging_to_zero(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @mul_select_eq_undef_vector_not_merging_to_zero( -; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i32> [[X:%.*]], +define <2 x i32> @mul_select_eq_poison_vector_not_merging_to_zero(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @mul_select_eq_poison_vector_not_merging_to_zero( +; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[M:%.*]] = mul <2 x i32> [[X]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[C]], <2 x i32> , <2 x i32> [[M]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %c = icmp eq <2 x i32> %x, + %c = icmp eq <2 x i32> %x, %m = mul <2 x i32> %x, %y %r = select <2 x i1> %c, <2 x i32> , <2 x i32> %m ret <2 x i32> %r diff --git a/llvm/test/Transforms/InstCombine/select_meta.ll b/llvm/test/Transforms/InstCombine/select_meta.ll index aa794e82e0fdc6..3898fd9fa1f578 100644 --- a/llvm/test/Transforms/InstCombine/select_meta.ll +++ b/llvm/test/Transforms/InstCombine/select_meta.ll @@ -301,15 +301,15 @@ define <2 x i32> @not_cond_vec(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) { ret <2 x i32> %r } -; Should match vector 'not' with undef element. +; Should match vector 'not' with poison element. 
; The condition is inverted, and the select ops are swapped. The metadata should be swapped.
-define <2 x i32> @not_cond_vec_undef(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) {
-; CHECK-LABEL: @not_cond_vec_undef(
+define <2 x i32> @not_cond_vec_poison(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) {
+; CHECK-LABEL: @not_cond_vec_poison(
; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[C:%.*]], <2 x i32> [[FV:%.*]], <2 x i32> [[TV:%.*]], !prof [[PROF1]]
; CHECK-NEXT: ret <2 x i32> [[R]]
;
- %notc = xor <2 x i1> %c,
+ %notc = xor <2 x i1> %c,
 %r = select <2 x i1> %notc, <2 x i32> %tv, <2 x i32> %fv, !prof !1
 ret <2 x i32> %r
}

diff --git a/llvm/test/Transforms/InstCombine/set-lowbits-mask-canonicalize.ll b/llvm/test/Transforms/InstCombine/set-lowbits-mask-canonicalize.ll
index 3ee0224eb1d03a..a3c8d3393d04fa 100644
--- a/llvm/test/Transforms/InstCombine/set-lowbits-mask-canonicalize.ll
+++ b/llvm/test/Transforms/InstCombine/set-lowbits-mask-canonicalize.ll
@@ -196,36 +196,36 @@ define <2 x i32> @shl_add_vec(<2 x i32> %NBits) {
 ret <2 x i32> %ret
}

-define <3 x i32> @shl_add_vec_undef0(<3 x i32> %NBits) {
-; CHECK-LABEL: @shl_add_vec_undef0(
+define <3 x i32> @shl_add_vec_poison0(<3 x i32> %NBits) {
+; CHECK-LABEL: @shl_add_vec_poison0(
; CHECK-NEXT: [[NOTMASK:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]]
; CHECK-NEXT: [[RET:%.*]] = xor <3 x i32> [[NOTMASK]],
; CHECK-NEXT: ret <3 x i32> [[RET]]
;
- %setbit = shl <3 x i32> , %NBits
+ %setbit = shl <3 x i32> , %NBits
 %ret = add <3 x i32> %setbit,
 ret <3 x i32> %ret
}

-define <3 x i32> @shl_add_vec_undef1(<3 x i32> %NBits) {
-; CHECK-LABEL: @shl_add_vec_undef1(
+define <3 x i32> @shl_add_vec_poison1(<3 x i32> %NBits) {
+; CHECK-LABEL: @shl_add_vec_poison1(
; CHECK-NEXT: [[NOTMASK:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]]
; CHECK-NEXT: [[RET:%.*]] = xor <3 x i32> [[NOTMASK]],
; CHECK-NEXT: ret <3 x i32> [[RET]]
;
 %setbit = shl <3 x i32> , %NBits
- %ret = add <3 x i32> %setbit,
+ %ret = add <3 x i32> %setbit,
 ret <3 x i32> %ret
}

-define <3 x i32> @shl_add_vec_undef2(<3 x i32> %NBits) {
-; CHECK-LABEL: @shl_add_vec_undef2(
+define <3 x i32> @shl_add_vec_poison2(<3 x i32> %NBits) {
+; CHECK-LABEL: @shl_add_vec_poison2(
; CHECK-NEXT: [[NOTMASK:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]]
; CHECK-NEXT: [[RET:%.*]] = xor <3 x i32> [[NOTMASK]],
; CHECK-NEXT: ret <3 x i32> [[RET]]
;
- %setbit = shl <3 x i32> , %NBits
- %ret = add <3 x i32> %setbit,
+ %setbit = shl <3 x i32> , %NBits
+ %ret = add <3 x i32> %setbit,
 ret <3 x i32> %ret
}

diff --git a/llvm/test/Transforms/InstCombine/sext.ll b/llvm/test/Transforms/InstCombine/sext.ll
index e3b6058ce7f806..6d263cfcda0577 100644
--- a/llvm/test/Transforms/InstCombine/sext.ll
+++ b/llvm/test/Transforms/InstCombine/sext.ll
@@ -167,39 +167,39 @@ define <2 x i32> @test10_vec_nonuniform(<2 x i32> %i) {
 ret <2 x i32> %D
}

-define <2 x i32> @test10_vec_undef0(<2 x i32> %i) {
-; CHECK-LABEL: @test10_vec_undef0(
-; CHECK-NEXT: [[D1:%.*]] = shl <2 x i32> [[I:%.*]],
-; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i32> [[D1]],
+define <2 x i32> @test10_vec_poison0(<2 x i32> %i) {
+; CHECK-LABEL: @test10_vec_poison0(
+; CHECK-NEXT: [[D1:%.*]] = shl <2 x i32> [[I:%.*]],
+; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i32> [[D1]],
; CHECK-NEXT: ret <2 x i32> [[D]]
;
 %A = trunc <2 x i32> %i to <2 x i8>
 %B = shl <2 x i8> %A,
- %C = ashr <2 x i8> %B,
+ %C = ashr <2 x i8> %B,
 %D = sext <2 x i8> %C to <2 x i32>
 ret <2 x i32> %D
}

-define <2 x i32> @test10_vec_undef1(<2 x i32> %i) {
-; CHECK-LABEL: @test10_vec_undef1(
+define <2 x i32> @test10_vec_poison1(<2 x i32> %i) {
+; CHECK-LABEL: @test10_vec_poison1(
; CHECK-NEXT: [[D1:%.*]] = shl <2 x i32> [[I:%.*]],
; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i32> [[D1]],
; CHECK-NEXT: ret <2 x i32> [[D]]
;
 %A = trunc <2 x i32> %i to <2 x i8>
- %B = shl <2 x i8> %A,
+ %B = shl <2 x i8> %A,
 %C = ashr <2 x i8> %B,
 %D = sext <2 x i8> %C to <2 x i32>
 ret <2 x i32> %D
}

-define <2 x i32> @test10_vec_undef2(<2 x i32> %i) {
-; CHECK-LABEL: @test10_vec_undef2(
-; CHECK-NEXT: [[D1:%.*]] = shl <2 x i32> [[I:%.*]],
-; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i32> [[D1]],
+define <2 x i32> @test10_vec_poison2(<2 x i32> %i) {
+; CHECK-LABEL: @test10_vec_poison2(
+; CHECK-NEXT: [[D1:%.*]] = shl <2 x i32> [[I:%.*]],
+; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i32> [[D1]],
; CHECK-NEXT: ret <2 x i32> [[D]]
;
 %A = trunc <2 x i32> %i to <2 x i8>
- %B = shl <2 x i8> %A,
- %C = ashr <2 x i8> %B,
+ %B = shl <2 x i8> %A,
+ %C = ashr <2 x i8> %B,
 %D = sext <2 x i8> %C to <2 x i32>
 ret <2 x i32> %D
}

diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest.ll
index 0262db1a01e5cf..96d429c62a88f7 100644
--- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest.ll
+++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest.ll
@@ -143,34 +143,34 @@ define <2 x i1> @t8_const_lshr_shl_ne_vec_nonsplat(<2 x i32> %x, <2 x i32> %y) {
 %t3 = icmp ne <2 x i32> %t2,
 ret <2 x i1> %t3
}
-define <3 x i1> @t9_const_lshr_shl_ne_vec_undef0(<3 x i32> %x, <3 x i32> %y) {
-; CHECK-LABEL: @t9_const_lshr_shl_ne_vec_undef0(
-; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]],
+define <3 x i1> @t9_const_lshr_shl_ne_vec_poison0(<3 x i32> %x, <3 x i32> %y) {
+; CHECK-LABEL: @t9_const_lshr_shl_ne_vec_poison0(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]],
; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]]
; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT: ret <3 x i1> [[T3]]
;
- %t0 = lshr <3 x i32> %x,
+ %t0 = lshr <3 x i32> %x,
 %t1 = shl <3 x i32> %y,
 %t2 = and <3 x i32> %t1, %t0
 %t3 = icmp ne <3 x i32> %t2,
 ret <3 x i1> %t3
}
-define <3 x i1> @t10_const_lshr_shl_ne_vec_undef1(<3 x i32> %x, <3 x i32> %y) {
-; CHECK-LABEL: @t10_const_lshr_shl_ne_vec_undef1(
-; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]],
+define <3 x i1> @t10_const_lshr_shl_ne_vec_poison1(<3 x i32> %x, <3 x i32> %y) {
+; CHECK-LABEL: @t10_const_lshr_shl_ne_vec_poison1(
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]],
; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]]
; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer
; CHECK-NEXT: ret <3 x i1> [[T3]]
;
 %t0 = lshr <3 x i32> %x,
- %t1 = shl <3 x i32> %y,
+ %t1 = shl <3 x i32> %y,
 %t2 = and <3 x i32> %t1, %t0
 %t3 = icmp ne <3 x i32> %t2,
 ret <3 x i1> %t3
}
-define <3 x i1> @t11_const_lshr_shl_ne_vec_undef2(<3 x i32> %x, <3 x i32> %y) {
-; CHECK-LABEL: @t11_const_lshr_shl_ne_vec_undef2(
+define <3 x i1> @t11_const_lshr_shl_ne_vec_poison2(<3 x i32> %x, <3 x i32> %y) {
+; CHECK-LABEL: @t11_const_lshr_shl_ne_vec_poison2(
; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]],
; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]]
; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer
@@ -179,59 +179,59 @@ define <3 x i1> @t11_const_lshr_shl_ne_vec_undef2(<3 x i32> %x, <3 x i32> %y) {
 %t0 = lshr <3 x i32> %x,
 %t1 = shl <3 x i32> %y,
 %t2 = and <3 x i32> %t1, %t0
-
%t3 = icmp ne <3 x i32> %t2, + %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } -define <3 x i1> @t12_const_lshr_shl_ne_vec_undef3(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t12_const_lshr_shl_ne_vec_undef3( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i1> @t12_const_lshr_shl_ne_vec_poison3(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t12_const_lshr_shl_ne_vec_poison3( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[T3]] ; - %t0 = lshr <3 x i32> %x, - %t1 = shl <3 x i32> %y, + %t0 = lshr <3 x i32> %x, + %t1 = shl <3 x i32> %y, %t2 = and <3 x i32> %t1, %t0 %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } -define <3 x i1> @t13_const_lshr_shl_ne_vec_undef4(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t13_const_lshr_shl_ne_vec_undef4( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i1> @t13_const_lshr_shl_ne_vec_poison4(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t13_const_lshr_shl_ne_vec_poison4( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[T3]] ; %t0 = lshr <3 x i32> %x, - %t1 = shl <3 x i32> %y, + %t1 = shl <3 x i32> %y, %t2 = and <3 x i32> %t1, %t0 - %t3 = icmp ne <3 x i32> %t2, + %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } -define <3 x i1> @t14_const_lshr_shl_ne_vec_undef5(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t14_const_lshr_shl_ne_vec_undef5( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i1> @t14_const_lshr_shl_ne_vec_poison5(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t14_const_lshr_shl_ne_vec_poison5( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[T3]] ; - %t0 = lshr <3 x i32> %x, + %t0 = lshr <3 x i32> %x, %t1 = shl <3 x i32> %y, %t2 = and <3 x i32> %t1, %t0 - %t3 = icmp ne <3 x i32> %t2, + %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } -define <3 x i1> @t15_const_lshr_shl_ne_vec_undef6(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t15_const_lshr_shl_ne_vec_undef6( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i1> @t15_const_lshr_shl_ne_vec_poison6(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t15_const_lshr_shl_ne_vec_poison6( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[T3]] ; - %t0 = lshr <3 x i32> %x, - %t1 = shl <3 x i32> %y, + %t0 = lshr <3 x i32> %x, + %t1 = shl <3 x i32> %y, %t2 = and <3 x i32> %t1, %t0 - %t3 = icmp ne <3 x i32> %t2, + %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-ashr.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-ashr.ll index 84dd4c57ebc619..9efc30cc9d916e 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-ashr.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-ashr.ll @@ -42,13 +42,13 @@ define <2 x i16> @t1_vec_splat(<2 x i32> %x, <2 x i16> %y) { ret <2 x i16> %t5 } 
-define <3 x i16> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: @t3_vec_nonsplat_undef0( -; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], +define <3 x i16> @t3_vec_nonsplat_poison0(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t3_vec_nonsplat_poison0( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; - %t0 = sub <3 x i16> , %y + %t0 = sub <3 x i16> , %y %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = ashr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> @@ -57,9 +57,9 @@ define <3 x i16> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i16> %y) { ret <3 x i16> %t5 } -define <3 x i16> @t4_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: @t4_vec_nonsplat_undef1( -; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], +define <3 x i16> @t4_vec_nonsplat_poison1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t4_vec_nonsplat_poison1( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; @@ -67,22 +67,22 @@ define <3 x i16> @t4_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = ashr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> - %t4 = add <3 x i16> %y, + %t4 = add <3 x i16> %y, %t5 = ashr <3 x i16> %t3, %t4 ret <3 x i16> %t5 } -define <3 x i16> @t5_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: @t5_vec_nonsplat_undef1( -; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], +define <3 x i16> @t5_vec_nonsplat_poison1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t5_vec_nonsplat_poison1( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; - %t0 = sub <3 x i16> , %y + %t0 = sub <3 x i16> , %y %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = ashr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> - %t4 = add <3 x i16> %y, + %t4 = add <3 x i16> %y, %t5 = ashr <3 x i16> %t3, %t4 ret <3 x i16> %t5 } diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-lshr.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-lshr.ll index 214ec88d2e551d..c31b6ed3ea2ba9 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-lshr.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-lshr.ll @@ -42,13 +42,13 @@ define <2 x i16> @t1_vec_splat(<2 x i32> %x, <2 x i16> %y) { ret <2 x i16> %t5 } -define <3 x i16> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: @t3_vec_nonsplat_undef0( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], -; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +define <3 x i16> @t3_vec_nonsplat_poison0(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t3_vec_nonsplat_poison0( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc nuw nsw <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; - %t0 = sub <3 x i16> , %y + %t0 = sub <3 x i16> , %y %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = lshr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> @@ -57,32 +57,32 @@ define <3 x i16> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i16> %y) { ret <3 x i16> %t5 } -define <3 x i16> @t4_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: 
@t4_vec_nonsplat_undef1( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], -; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +define <3 x i16> @t4_vec_nonsplat_poison1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t4_vec_nonsplat_poison1( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc nuw nsw <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; %t0 = sub <3 x i16> , %y %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = lshr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> - %t4 = add <3 x i16> %y, + %t4 = add <3 x i16> %y, %t5 = lshr <3 x i16> %t3, %t4 ret <3 x i16> %t5 } -define <3 x i16> @t5_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: @t5_vec_nonsplat_undef1( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], -; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +define <3 x i16> @t5_vec_nonsplat_poison1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t5_vec_nonsplat_poison1( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc nuw nsw <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; - %t0 = sub <3 x i16> , %y + %t0 = sub <3 x i16> , %y %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = lshr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> - %t4 = add <3 x i16> %y, + %t4 = add <3 x i16> %y, %t5 = lshr <3 x i16> %t3, %t4 ret <3 x i16> %t5 } diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll index b96bcd6bab4f14..6bbe4c5151e458 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll @@ -48,38 +48,38 @@ define <2 x i32> @t2_vec_nonsplat(<2 x i32> %x, <2 x i32> %y) { ; Basic vector tests -define <3 x i32> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t3_vec_nonsplat_undef0( -; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i32> @t3_vec_nonsplat_poison0(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t3_vec_nonsplat_poison0( +; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: ret <3 x i32> [[T3]] ; - %t0 = sub <3 x i32> , %y + %t0 = sub <3 x i32> , %y %t1 = lshr <3 x i32> %x, %t0 %t2 = add <3 x i32> %y, %t3 = lshr <3 x i32> %t1, %t2 ret <3 x i32> %t3 } -define <3 x i32> @t4_vec_nonsplat_undef1(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t4_vec_nonsplat_undef1( -; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i32> @t4_vec_nonsplat_poison1(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t4_vec_nonsplat_poison1( +; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: ret <3 x i32> [[T3]] ; %t0 = sub <3 x i32> , %y %t1 = lshr <3 x i32> %x, %t0 - %t2 = add <3 x i32> %y, + %t2 = add <3 x i32> %y, %t3 = lshr <3 x i32> %t1, %t2 ret <3 x i32> %t3 } -define <3 x i32> @t5_vec_nonsplat_undef1(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t5_vec_nonsplat_undef1( -; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i32> @t5_vec_nonsplat_poison1(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t5_vec_nonsplat_poison1( +; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: ret <3 x i32> [[T3]] ; - %t0 = sub <3 x i32> , %y + %t0 = sub <3 x i32> , %y %t1 = lshr <3 x i32> %x, %t0 - %t2 = add <3 x i32> %y, + %t2 = add <3 x i32> %y, %t3 = lshr <3 x i32> %t1, %t2 ret <3 x i32> %t3 } diff --git a/llvm/test/Transforms/InstCombine/shift-logic.ll 
b/llvm/test/Transforms/InstCombine/shift-logic.ll index c982b45b504e9a..b591400c6a2603 100644 --- a/llvm/test/Transforms/InstCombine/shift-logic.ll +++ b/llvm/test/Transforms/InstCombine/shift-logic.ll @@ -44,18 +44,18 @@ define i16 @shl_or(i16 %x, i16 %py) { ret i16 %sh1 } -define <2 x i16> @shl_or_undef(<2 x i16> %x, <2 x i16> %py) { -; CHECK-LABEL: @shl_or_undef( +define <2 x i16> @shl_or_poison(<2 x i16> %x, <2 x i16> %py) { +; CHECK-LABEL: @shl_or_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i16> [[PY:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i16> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i16> [[Y]], +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i16> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <2 x i16> [[Y]], ; CHECK-NEXT: [[SH1:%.*]] = or <2 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <2 x i16> [[SH1]] ; %y = srem <2 x i16> %py, ; thwart complexity-based canonicalization - %sh0 = shl <2 x i16> %x, + %sh0 = shl <2 x i16> %x, %r = or <2 x i16> %y, %sh0 - %sh1 = shl <2 x i16> %r, + %sh1 = shl <2 x i16> %r, ret <2 x i16> %sh1 } @@ -100,18 +100,18 @@ define i64 @lshr_and(i64 %x, i64 %py) { ret i64 %sh1 } -define <2 x i64> @lshr_and_undef(<2 x i64> %x, <2 x i64> %py) { -; CHECK-LABEL: @lshr_and_undef( +define <2 x i64> @lshr_and_poison(<2 x i64> %x, <2 x i64> %py) { +; CHECK-LABEL: @lshr_and_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i64> [[PY:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[Y]], +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[Y]], ; CHECK-NEXT: [[SH1:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <2 x i64> [[SH1]] ; %y = srem <2 x i64> %py, ; thwart complexity-based canonicalization - %sh0 = lshr <2 x i64> %x, + %sh0 = lshr <2 x i64> %x, %r = and <2 x i64> %y, %sh0 - %sh1 = lshr <2 x i64> %r, + %sh1 = lshr <2 x i64> %r, ret <2 x i64> %sh1 } @@ -212,16 +212,16 @@ define i32 @ashr_overshift_xor(i32 %x, i32 %y) { ret i32 %sh1 } -define <2 x i32> @ashr_undef_undef_xor(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @ashr_undef_undef_xor( -; CHECK-NEXT: [[SH0:%.*]] = ashr <2 x i32> [[X:%.*]], +define <2 x i32> @ashr_poison_poison_xor(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @ashr_poison_poison_xor( +; CHECK-NEXT: [[SH0:%.*]] = ashr <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = xor <2 x i32> [[SH0]], [[Y:%.*]] -; CHECK-NEXT: [[SH1:%.*]] = ashr <2 x i32> [[R]], +; CHECK-NEXT: [[SH1:%.*]] = ashr <2 x i32> [[R]], ; CHECK-NEXT: ret <2 x i32> [[SH1]] ; - %sh0 = ashr <2 x i32> %x, + %sh0 = ashr <2 x i32> %x, %r = xor <2 x i32> %y, %sh0 - %sh1 = ashr <2 x i32> %r, + %sh1 = ashr <2 x i32> %r, ret <2 x i32> %sh1 } @@ -390,18 +390,18 @@ define <2 x i8> @shl_add_nonuniform(<2 x i8> %x, <2 x i8> %y) { } -define <2 x i64> @shl_add_undef(<2 x i64> %x, <2 x i64> %py) { -; CHECK-LABEL: @shl_add_undef( +define <2 x i64> @shl_add_poison(<2 x i64> %x, <2 x i64> %py) { +; CHECK-LABEL: @shl_add_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i64> [[PY:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[Y]], +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <2 x i64> [[Y]], ; CHECK-NEXT: [[SH1:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <2 x i64> [[SH1]] ; %y = srem <2 x i64> %py, ; thwart complexity-based canonicalization - %sh0 = shl <2 x i64> %x, + %sh0 = shl <2 x i64> %x, %r = add <2 x i64> %y, %sh0 - %sh1 = shl <2 x i64> %r, + %sh1 = shl <2 x 
i64> %r, ret <2 x i64> %sh1 } @@ -432,18 +432,18 @@ define <2 x i8> @lshr_add_nonuniform(<2 x i8> %x, <2 x i8> %y) { ret <2 x i8> %sh1 } -define <2 x i64> @lshr_add_undef(<2 x i64> %x, <2 x i64> %py) { -; CHECK-LABEL: @lshr_add_undef( +define <2 x i64> @lshr_add_poison(<2 x i64> %x, <2 x i64> %py) { +; CHECK-LABEL: @lshr_add_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i64> [[PY:%.*]], -; CHECK-NEXT: [[SH0:%.*]] = lshr <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = add <2 x i64> [[Y]], [[SH0]] -; CHECK-NEXT: [[SH1:%.*]] = lshr <2 x i64> [[R]], +; CHECK-NEXT: [[SH0:%.*]] = lshr <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = add nsw <2 x i64> [[Y]], [[SH0]] +; CHECK-NEXT: [[SH1:%.*]] = lshr <2 x i64> [[R]], ; CHECK-NEXT: ret <2 x i64> [[SH1]] ; %y = srem <2 x i64> %py, ; thwart complexity-based canonicalization - %sh0 = lshr <2 x i64> %x, + %sh0 = lshr <2 x i64> %x, %r = add <2 x i64> %y, %sh0 - %sh1 = lshr <2 x i64> %r, + %sh1 = lshr <2 x i64> %r, ret <2 x i64> %sh1 } @@ -488,18 +488,18 @@ define <2 x i8> @shl_sub_nonuniform(<2 x i8> %x, <2 x i8> %y) { } -define <2 x i64> @shl_sub_undef(<2 x i64> %x, <2 x i64> %py) { -; CHECK-LABEL: @shl_sub_undef( +define <2 x i64> @shl_sub_poison(<2 x i64> %x, <2 x i64> %py) { +; CHECK-LABEL: @shl_sub_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i64> [[PY:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[Y]], +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <2 x i64> [[Y]], ; CHECK-NEXT: [[SH1:%.*]] = sub <2 x i64> [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret <2 x i64> [[SH1]] ; %y = srem <2 x i64> %py, ; thwart complexity-based canonicalization - %sh0 = shl <2 x i64> %x, + %sh0 = shl <2 x i64> %x, %r = sub <2 x i64> %y, %sh0 - %sh1 = shl <2 x i64> %r, + %sh1 = shl <2 x i64> %r, ret <2 x i64> %sh1 } @@ -530,17 +530,17 @@ define <2 x i8> @lshr_sub_nonuniform(<2 x i8> %x, <2 x i8> %y) { ret <2 x i8> %sh1 } -define <2 x i64> @lshr_sub_undef(<2 x i64> %x, <2 x i64> %py) { -; CHECK-LABEL: @lshr_sub_undef( +define <2 x i64> @lshr_sub_poison(<2 x i64> %x, <2 x i64> %py) { +; CHECK-LABEL: @lshr_sub_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i64> [[PY:%.*]], -; CHECK-NEXT: [[SH0:%.*]] = lshr <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = sub <2 x i64> [[Y]], [[SH0]] -; CHECK-NEXT: [[SH1:%.*]] = lshr <2 x i64> [[R]], +; CHECK-NEXT: [[SH0:%.*]] = lshr <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = sub nsw <2 x i64> [[Y]], [[SH0]] +; CHECK-NEXT: [[SH1:%.*]] = lshr <2 x i64> [[R]], ; CHECK-NEXT: ret <2 x i64> [[SH1]] ; %y = srem <2 x i64> %py, ; thwart complexity-based canonicalization - %sh0 = lshr <2 x i64> %x, + %sh0 = lshr <2 x i64> %x, %r = sub <2 x i64> %y, %sh0 - %sh1 = lshr <2 x i64> %r, + %sh1 = lshr <2 x i64> %r, ret <2 x i64> %sh1 } diff --git a/llvm/test/Transforms/InstCombine/shl-and-negC-icmpeq-zero.ll b/llvm/test/Transforms/InstCombine/shl-and-negC-icmpeq-zero.ll index 406dc72f2646e5..daa49557965943 100644 --- a/llvm/test/Transforms/InstCombine/shl-and-negC-icmpeq-zero.ll +++ b/llvm/test/Transforms/InstCombine/shl-and-negC-icmpeq-zero.ll @@ -81,39 +81,39 @@ define <4 x i1> @vec_4xi32_shl_and_negC_eq(<4 x i32> %x, <4 x i32> %y) { ret <4 x i1> %r } -define <4 x i1> @vec_shl_and_negC_eq_undef1(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_shl_and_negC_eq_undef1( +define <4 x i1> @vec_shl_and_negC_eq_poison1(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_shl_and_negC_eq_poison1( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; 
CHECK-NEXT: [[R:%.*]] = icmp ult <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y - %and = and <4 x i32> %shl, ; ~7 + %and = and <4 x i32> %shl, ; ~7 %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_shl_and_negC_eq_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_shl_and_negC_eq_undef2( +define <4 x i1> @vec_shl_and_negC_eq_poison2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_shl_and_negC_eq_poison2( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ult <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y %and = and <4 x i32> %shl, ; ~7 - %r = icmp eq <4 x i32> %and, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_shl_and_negC_eq_undef3(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_shl_and_negC_eq_undef3( +define <4 x i1> @vec_shl_and_negC_eq_poison3(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_shl_and_negC_eq_poison3( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ult <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y - %and = and <4 x i32> %shl, ; ~7 - %r = icmp eq <4 x i32> %and, + %and = and <4 x i32> %shl, ; ~7 + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/shl-and-signbit-icmpeq-zero.ll b/llvm/test/Transforms/InstCombine/shl-and-signbit-icmpeq-zero.ll index 4c2c876e3925bf..dcc181945357da 100644 --- a/llvm/test/Transforms/InstCombine/shl-and-signbit-icmpeq-zero.ll +++ b/llvm/test/Transforms/InstCombine/shl-and-signbit-icmpeq-zero.ll @@ -81,39 +81,39 @@ define <4 x i1> @vec_4xi32_shl_and_signbit_eq(<4 x i32> %x, <4 x i32> %y) { ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_shl_and_signbit_eq_undef1(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_undef1( +define <4 x i1> @vec_4xi32_shl_and_signbit_eq_poison1(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_poison1( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y - %and = and <4 x i32> %shl, + %and = and <4 x i32> %shl, %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_shl_and_signbit_eq_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_undef2( +define <4 x i1> @vec_4xi32_shl_and_signbit_eq_poison2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_poison2( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y %and = and <4 x i32> %shl, - %r = icmp eq <4 x i32> %and, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_shl_and_signbit_eq_undef3(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_undef3( +define <4 x i1> @vec_4xi32_shl_and_signbit_eq_poison3(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_poison3( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y - %and = and <4 x i32> %shl, - %r = icmp eq <4 x i32> %and, + %and = and <4 x i32> %shl, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } diff --git 
a/llvm/test/Transforms/InstCombine/signmask-of-sext-vs-of-shl-of-zext.ll b/llvm/test/Transforms/InstCombine/signmask-of-sext-vs-of-shl-of-zext.ll index aeb4c8bb62cba6..e7505721cad604 100644 --- a/llvm/test/Transforms/InstCombine/signmask-of-sext-vs-of-shl-of-zext.ll +++ b/llvm/test/Transforms/InstCombine/signmask-of-sext-vs-of-shl-of-zext.ll @@ -129,40 +129,56 @@ define <2 x i32> @t8(<2 x i16> %x) { %r = and <2 x i32> %i1, ret <2 x i32> %r } + define <2 x i32> @t9(<2 x i16> %x) { ; CHECK-LABEL: @t9( -; CHECK-NEXT: [[X_SIGNEXT:%.*]] = sext <2 x i16> [[X:%.*]] to <2 x i32> -; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[X_SIGNEXT]], +; CHECK-NEXT: [[I1:%.*]] = sext <2 x i16> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[I1]], ; CHECK-NEXT: ret <2 x i32> [[R]] ; %i0 = zext <2 x i16> %x to <2 x i32> - %i1 = shl <2 x i32> %i0, + %i1 = shl <2 x i32> %i0, %r = and <2 x i32> %i1, - ; Here undef can be propagated into the mask. ret <2 x i32> %r } -define <2 x i32> @t10(<2 x i16> %x) { -; CHECK-LABEL: @t10( -; CHECK-NEXT: [[X_SIGNEXT:%.*]] = sext <2 x i16> [[X:%.*]] to <2 x i32> -; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[X_SIGNEXT]], + +; If we folded this, we wouldn't be able to keep the undef mask. +define <2 x i32> @t10_undef(<2 x i16> %x) { +; CHECK-LABEL: @t10_undef( +; CHECK-NEXT: [[I0:%.*]] = zext <2 x i16> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[I1:%.*]] = shl nuw <2 x i32> [[I0]], +; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[I1]], ; CHECK-NEXT: ret <2 x i32> [[R]] ; %i0 = zext <2 x i16> %x to <2 x i32> %i1 = shl <2 x i32> %i0, %r = and <2 x i32> %i1, - ; CAREFUL! We can't keep undef mask here, since high bits are no longer zero, + ret <2 x i32> %r +} + +define <2 x i32> @t10_poison(<2 x i16> %x) { +; CHECK-LABEL: @t10_poison( +; CHECK-NEXT: [[I1:%.*]] = sext <2 x i16> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[I1]], +; CHECK-NEXT: ret <2 x i32> [[R]] +; + %i0 = zext <2 x i16> %x to <2 x i32> + %i1 = shl <2 x i32> %i0, + %r = and <2 x i32> %i1, + ; CAREFUL! We can't keep poison mask here, since high bits are no longer zero, ; we must sanitize it to 0. ret <2 x i32> %r } + define <2 x i32> @t11(<2 x i16> %x) { ; CHECK-LABEL: @t11( ; CHECK-NEXT: [[X_SIGNEXT:%.*]] = sext <2 x i16> [[X:%.*]] to <2 x i32> -; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[X_SIGNEXT]], +; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[X_SIGNEXT]], ; CHECK-NEXT: ret <2 x i32> [[R]] ; %i0 = zext <2 x i16> %x to <2 x i32> - %i1 = shl <2 x i32> %i0, - %r = and <2 x i32> %i1, - ; Here undef mask is fine. + %i1 = shl <2 x i32> %i0, + %r = and <2 x i32> %i1, + ; Here poison mask is fine. 
ret <2 x i32> %r } diff --git a/llvm/test/Transforms/InstCombine/sub-not.ll b/llvm/test/Transforms/InstCombine/sub-not.ll index ec36754d3e9b1d..89ccf5aa3c8f4f 100644 --- a/llvm/test/Transforms/InstCombine/sub-not.ll +++ b/llvm/test/Transforms/InstCombine/sub-not.ll @@ -34,7 +34,7 @@ define <2 x i8> @sub_not_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: ret <2 x i8> [[R]] ; %s = sub <2 x i8> %x, %y - %r = xor <2 x i8> %s, + %r = xor <2 x i8> %s, ret <2 x i8> %r } @@ -69,7 +69,7 @@ define <2 x i8> @dec_sub_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: ret <2 x i8> [[R]] ; %s = sub <2 x i8> %x, %y - %r = add <2 x i8> %s, + %r = add <2 x i8> %s, ret <2 x i8> %r } @@ -103,7 +103,7 @@ define <2 x i8> @sub_inc_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[S_NEG]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; - %s = add <2 x i8> %x, + %s = add <2 x i8> %x, %r = sub <2 x i8> %y, %s ret <2 x i8> %r } @@ -138,7 +138,7 @@ define <2 x i8> @sub_dec_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; - %s = add <2 x i8> %x, + %s = add <2 x i8> %x, %r = sub <2 x i8> %s, %y ret <2 x i8> %r } diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll index 249b5673c8acfd..a84e389f13c3b8 100644 --- a/llvm/test/Transforms/InstCombine/sub.ll +++ b/llvm/test/Transforms/InstCombine/sub.ll @@ -130,44 +130,44 @@ define <2 x i32> @neg_nsw_sub_nsw_vec(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %r } -define <2 x i32> @neg_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @neg_sub_vec_undef( +define <2 x i32> @neg_sub_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @neg_sub_vec_poison( ; CHECK-NEXT: [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %neg = sub <2 x i32> , %x + %neg = sub <2 x i32> , %x %r = sub <2 x i32> %y, %neg ret <2 x i32> %r } -define <2 x i32> @neg_nsw_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @neg_nsw_sub_vec_undef( +define <2 x i32> @neg_nsw_sub_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @neg_nsw_sub_vec_poison( ; CHECK-NEXT: [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %neg = sub nsw <2 x i32> , %x + %neg = sub nsw <2 x i32> , %x %r = sub <2 x i32> %y, %neg ret <2 x i32> %r } -define <2 x i32> @neg_sub_nsw_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @neg_sub_nsw_vec_undef( +define <2 x i32> @neg_sub_nsw_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @neg_sub_nsw_vec_poison( ; CHECK-NEXT: [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %neg = sub <2 x i32> , %x + %neg = sub <2 x i32> , %x %r = sub nsw <2 x i32> %y, %neg ret <2 x i32> %r } ; This should not drop 'nsw'. 
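; (Editor's sketch; the constant shape is hypothetical, not copied from the
; test below.) With nsw on both subs, the negation fold can keep the flag:
;   %neg = sub nsw <2 x i32> zeroinitializer, %x
;   %r   = sub nsw <2 x i32> %y, %neg
;   -->
;   %r   = add nsw <2 x i32> %y, %x
; If neither sub wrapped, -x is representable and y - (-x) == y + x is computed
; without wrapping, so `add nsw` is justified. The test below verifies that the
; flag survives: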
-define <2 x i32> @neg_nsw_sub_nsw_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @neg_nsw_sub_nsw_vec_undef( +define <2 x i32> @neg_nsw_sub_nsw_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @neg_nsw_sub_nsw_vec_poison( ; CHECK-NEXT: [[R:%.*]] = add nsw <2 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %neg = sub nsw <2 x i32> , %x + %neg = sub nsw <2 x i32> , %x %r = sub nsw <2 x i32> %y, %neg ret <2 x i32> %r } @@ -205,13 +205,13 @@ define <2 x i8> @notnotsub_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i8> %sub } -define <2 x i8> @notnotsub_vec_undef_elts(<2 x i8> %x, <2 x i8> %y) { -; CHECK-LABEL: @notnotsub_vec_undef_elts( +define <2 x i8> @notnotsub_vec_poison_elts(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @notnotsub_vec_poison_elts( ; CHECK-NEXT: [[SUB:%.*]] = sub <2 x i8> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i8> [[SUB]] ; - %nx = xor <2 x i8> %x, - %ny = xor <2 x i8> %y, + %nx = xor <2 x i8> %x, + %ny = xor <2 x i8> %y, %sub = sub <2 x i8> %nx, %ny ret <2 x i8> %sub } @@ -2351,12 +2351,12 @@ define <2 x i8> @sub_to_and_vector1(<2 x i8> %x) { define <2 x i8> @sub_to_and_vector2(<2 x i8> %x) { ; CHECK-LABEL: @sub_to_and_vector2( -; CHECK-NEXT: [[SUB:%.*]] = sub nuw <2 x i8> , [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw <2 x i8> , [[X:%.*]] ; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[SUB]], ; CHECK-NEXT: [[R:%.*]] = sub nsw <2 x i8> , [[AND]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; - %sub = sub nuw <2 x i8> , %x + %sub = sub nuw <2 x i8> , %x %and = and <2 x i8> %sub, %r = sub <2 x i8> , %and ret <2 x i8> %r @@ -2366,12 +2366,12 @@ define <2 x i8> @sub_to_and_vector2(<2 x i8> %x) { define <2 x i8> @sub_to_and_vector3(<2 x i8> %x) { ; CHECK-LABEL: @sub_to_and_vector3( ; CHECK-NEXT: [[SUB:%.*]] = sub nuw <2 x i8> , [[X:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[SUB]], +; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[SUB]], ; CHECK-NEXT: [[R:%.*]] = sub nsw <2 x i8> , [[AND]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %sub = sub nuw <2 x i8> , %x - %and = and <2 x i8> %sub, + %and = and <2 x i8> %sub, %r = sub <2 x i8> , %and ret <2 x i8> %r } @@ -2381,12 +2381,12 @@ define <2 x i8> @sub_to_and_vector4(<2 x i8> %x) { ; CHECK-LABEL: @sub_to_and_vector4( ; CHECK-NEXT: [[SUB:%.*]] = sub nuw <2 x i8> , [[X:%.*]] ; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[SUB]], -; CHECK-NEXT: [[R:%.*]] = sub <2 x i8> , [[AND]] +; CHECK-NEXT: [[R:%.*]] = sub nsw <2 x i8> , [[AND]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %sub = sub nuw <2 x i8> , %x %and = and <2 x i8> %sub, - %r = sub <2 x i8> , %and + %r = sub <2 x i8> , %and ret <2 x i8> %r } diff --git a/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll b/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll index 4c857125365a9b..063006ba5eea8b 100644 --- a/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll @@ -49,15 +49,15 @@ define <2 x i64> @test1_vec_nonuniform(<2 x i64> %a) { ret <2 x i64> %d } -define <2 x i64> @test1_vec_undef(<2 x i64> %a) { -; CHECK-LABEL: @test1_vec_undef( +define <2 x i64> @test1_vec_poison(<2 x i64> %a) { +; CHECK-LABEL: @test1_vec_poison( ; CHECK-NEXT: [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32> -; CHECK-NEXT: [[D:%.*]] = and <2 x i64> [[A]], +; CHECK-NEXT: [[D:%.*]] = and <2 x i64> [[A]], ; CHECK-NEXT: call void @use_vec(<2 x i32> [[B]]) ; CHECK-NEXT: ret <2 x i64> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = and <2 x i32> %b, + %c = and <2 x i32> %b, %d = zext <2 x i32> %c to <2 x i64> call void @use_vec(<2 x 
i32> %b) ret <2 x i64> %d @@ -111,17 +111,17 @@ define <2 x i64> @test2_vec_nonuniform(<2 x i64> %a) { ret <2 x i64> %d } -define <2 x i64> @test2_vec_undef(<2 x i64> %a) { -; CHECK-LABEL: @test2_vec_undef( +define <2 x i64> @test2_vec_poison(<2 x i64> %a) { +; CHECK-LABEL: @test2_vec_poison( ; CHECK-NEXT: [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32> -; CHECK-NEXT: [[D1:%.*]] = shl <2 x i64> [[A]], -; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i64> [[D1]], +; CHECK-NEXT: [[D1:%.*]] = shl <2 x i64> [[A]], +; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i64> [[D1]], ; CHECK-NEXT: call void @use_vec(<2 x i32> [[B]]) ; CHECK-NEXT: ret <2 x i64> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = shl <2 x i32> %b, - %q = ashr <2 x i32> %c, + %c = shl <2 x i32> %b, + %q = ashr <2 x i32> %c, %d = sext <2 x i32> %q to <2 x i64> call void @use_vec(<2 x i32> %b) ret <2 x i64> %d @@ -300,18 +300,17 @@ define <2 x i64> @test8_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test8_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test8_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = shl <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = or <2 x i128> [[E]], [[C]] -; CHECK-NEXT: [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test8_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test8_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i64> +; CHECK-NEXT: [[E:%.*]] = shl nuw <2 x i64> [[D]], +; CHECK-NEXT: [[G:%.*]] = or disjoint <2 x i64> [[E]], [[C]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = shl <2 x i128> %D, + %E = shl <2 x i128> %D, %F = or <2 x i128> %E, %C %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -388,18 +387,17 @@ define <2 x i64> @test11_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test11_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test11_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = shl <2 x i128> [[C]], [[E]] -; CHECK-NEXT: [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test11_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test11_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = shl nuw nsw <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F = shl <2 x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -453,18 +451,17 @@ define <2 x i64> @test12_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test12_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test12_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = lshr <2 x i128> [[C]], [[E]] -; CHECK-NEXT: 
[[G:%.*]] = trunc nuw nsw <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test12_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test12_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = lshr <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F = lshr <2 x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -518,18 +515,17 @@ define <2 x i64> @test13_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test13_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test13_vec_undef( -; CHECK-NEXT: [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = ashr <2 x i128> [[C]], [[E]] -; CHECK-NEXT: [[G:%.*]] = trunc nsw <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test13_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test13_vec_poison( +; CHECK-NEXT: [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = ashr <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = sext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F = ashr <2 x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -766,13 +762,13 @@ define <2 x i32> @trunc_shl_v2i32_v2i64_uniform(<2 x i64> %val) { ret <2 x i32> %trunc } -define <2 x i32> @trunc_shl_v2i32_v2i64_undef(<2 x i64> %val) { -; CHECK-LABEL: @trunc_shl_v2i32_v2i64_undef( +define <2 x i32> @trunc_shl_v2i32_v2i64_poison(<2 x i64> %val) { +; CHECK-LABEL: @trunc_shl_v2i32_v2i64_poison( ; CHECK-NEXT: [[VAL_TR:%.*]] = trunc <2 x i64> [[VAL:%.*]] to <2 x i32> -; CHECK-NEXT: [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], +; CHECK-NEXT: [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], ; CHECK-NEXT: ret <2 x i32> [[TRUNC]] ; - %shl = shl <2 x i64> %val, + %shl = shl <2 x i64> %val, %trunc = trunc <2 x i64> %shl to <2 x i32> ret <2 x i32> %trunc } @@ -917,7 +913,7 @@ define <4 x i8> @wide_shuf(<4 x i32> %x) { ret <4 x i8> %trunc } -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask define <4 x i8> @wide_splat1(<4 x i32> %x) { ; CHECK-LABEL: @wide_splat1( @@ -931,7 +927,7 @@ define <4 x i8> @wide_splat1(<4 x i32> %x) { } ; Test weird types. -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask define <3 x i31> @wide_splat2(<3 x i33> %x) { ; CHECK-LABEL: @wide_splat2( @@ -945,8 +941,8 @@ define <3 x i31> @wide_splat2(<3 x i33> %x) { } ; FIXME: -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask -; A mask with undef elements should still be considered a splat mask. +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask +; A mask with poison elements should still be considered a splat mask. 
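; (Editor's sketch of the desired fold; the mask values are illustrative, not
; taken from the test below.) A splat mask with a poison lane should still be
; treated as a splat:
;   %shuf  = shufflevector <3 x i33> %x, <3 x i33> poison, <3 x i32> <i32 0, i32 poison, i32 0>
;   %trunc = trunc <3 x i33> %shuf to <3 x i31>
;   -->
;   %tr    = trunc <3 x i33> %x to <3 x i31>
;   %trunc = shufflevector <3 x i31> %tr, <3 x i31> poison, <3 x i32> <i32 0, i32 poison, i32 0>
; Every non-poison lane selects lane 0 either way, and a poison result lane may
; always be refined, so the rewrite is sound; the matcher simply rejects such
; masks today, as the test below records: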
define <3 x i31> @wide_splat3(<3 x i33> %x) { ; CHECK-LABEL: @wide_splat3( @@ -954,7 +950,7 @@ define <3 x i31> @wide_splat3(<3 x i33> %x) { ; CHECK-NEXT: [[TRUNC:%.*]] = trunc <3 x i33> [[SHUF]] to <3 x i31> ; CHECK-NEXT: ret <3 x i31> [[TRUNC]] ; - %shuf = shufflevector <3 x i33> %x, <3 x i33> poison, <3 x i32> + %shuf = shufflevector <3 x i33> %x, <3 x i33> poison, <3 x i32> %trunc = trunc <3 x i33> %shuf to <3 x i31> ret <3 x i31> %trunc } diff --git a/llvm/test/Transforms/InstCombine/trunc-shift-trunc.ll b/llvm/test/Transforms/InstCombine/trunc-shift-trunc.ll index 2c5f428cf98de5..c50a3d06d24b9c 100644 --- a/llvm/test/Transforms/InstCombine/trunc-shift-trunc.ll +++ b/llvm/test/Transforms/InstCombine/trunc-shift-trunc.ll @@ -56,14 +56,14 @@ define <2 x i8> @trunc_lshr_trunc_nonuniform(<2 x i64> %a) { ret <2 x i8> %d } -define <2 x i8> @trunc_lshr_trunc_uniform_undef(<2 x i64> %a) { -; CHECK-LABEL: @trunc_lshr_trunc_uniform_undef( -; CHECK-NEXT: [[C1:%.*]] = lshr <2 x i64> [[A:%.*]], +define <2 x i8> @trunc_lshr_trunc_uniform_poison(<2 x i64> %a) { +; CHECK-LABEL: @trunc_lshr_trunc_uniform_poison( +; CHECK-NEXT: [[C1:%.*]] = lshr <2 x i64> [[A:%.*]], ; CHECK-NEXT: [[D:%.*]] = trunc <2 x i64> [[C1]] to <2 x i8> ; CHECK-NEXT: ret <2 x i8> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = lshr <2 x i32> %b, + %c = lshr <2 x i32> %b, %d = trunc <2 x i32> %c to <2 x i8> ret <2 x i8> %d } @@ -142,14 +142,14 @@ define <2 x i8> @trunc_ashr_trunc_nonuniform(<2 x i64> %a) { ret <2 x i8> %d } -define <2 x i8> @trunc_ashr_trunc_uniform_undef(<2 x i64> %a) { -; CHECK-LABEL: @trunc_ashr_trunc_uniform_undef( -; CHECK-NEXT: [[C1:%.*]] = ashr <2 x i64> [[A:%.*]], +define <2 x i8> @trunc_ashr_trunc_uniform_poison(<2 x i64> %a) { +; CHECK-LABEL: @trunc_ashr_trunc_uniform_poison( +; CHECK-NEXT: [[C1:%.*]] = ashr <2 x i64> [[A:%.*]], ; CHECK-NEXT: [[D:%.*]] = trunc <2 x i64> [[C1]] to <2 x i8> ; CHECK-NEXT: ret <2 x i8> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = ashr <2 x i32> %b, + %c = ashr <2 x i32> %b, %d = trunc <2 x i32> %c to <2 x i8> ret <2 x i8> %d } diff --git a/llvm/test/Transforms/InstCombine/trunc.ll b/llvm/test/Transforms/InstCombine/trunc.ll index c77d7269f2cf7d..e59b2bea6684c0 100644 --- a/llvm/test/Transforms/InstCombine/trunc.ll +++ b/llvm/test/Transforms/InstCombine/trunc.ll @@ -49,15 +49,15 @@ define <2 x i64> @test1_vec_nonuniform(<2 x i64> %a) { ret <2 x i64> %d } -define <2 x i64> @test1_vec_undef(<2 x i64> %a) { -; CHECK-LABEL: @test1_vec_undef( +define <2 x i64> @test1_vec_poison(<2 x i64> %a) { +; CHECK-LABEL: @test1_vec_poison( ; CHECK-NEXT: [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32> -; CHECK-NEXT: [[D:%.*]] = and <2 x i64> [[A]], +; CHECK-NEXT: [[D:%.*]] = and <2 x i64> [[A]], ; CHECK-NEXT: call void @use_vec(<2 x i32> [[B]]) ; CHECK-NEXT: ret <2 x i64> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = and <2 x i32> %b, + %c = and <2 x i32> %b, %d = zext <2 x i32> %c to <2 x i64> call void @use_vec(<2 x i32> %b) ret <2 x i64> %d @@ -111,17 +111,17 @@ define <2 x i64> @test2_vec_nonuniform(<2 x i64> %a) { ret <2 x i64> %d } -define <2 x i64> @test2_vec_undef(<2 x i64> %a) { -; CHECK-LABEL: @test2_vec_undef( +define <2 x i64> @test2_vec_poison(<2 x i64> %a) { +; CHECK-LABEL: @test2_vec_poison( ; CHECK-NEXT: [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32> -; CHECK-NEXT: [[D1:%.*]] = shl <2 x i64> [[A]], -; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i64> [[D1]], +; CHECK-NEXT: [[D1:%.*]] = shl <2 x i64> [[A]], +; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i64> 
[[D1]], ; CHECK-NEXT: call void @use_vec(<2 x i32> [[B]]) ; CHECK-NEXT: ret <2 x i64> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = shl <2 x i32> %b, - %q = ashr <2 x i32> %c, + %c = shl <2 x i32> %b, + %q = ashr <2 x i32> %c, %d = sext <2 x i32> %q to <2 x i64> call void @use_vec(<2 x i32> %b) ret <2 x i64> %d @@ -300,18 +300,17 @@ define <2 x i64> @test8_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test8_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test8_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = shl <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = or <2 x i128> [[E]], [[C]] -; CHECK-NEXT: [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test8_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test8_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i64> +; CHECK-NEXT: [[E:%.*]] = shl nuw <2 x i64> [[D]], +; CHECK-NEXT: [[G:%.*]] = or disjoint <2 x i64> [[E]], [[C]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = shl <2 x i128> %D, + %E = shl <2 x i128> %D, %F = or <2 x i128> %E, %C %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -388,18 +387,17 @@ define <2 x i64> @test11_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test11_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test11_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = shl <2 x i128> [[C]], [[E]] -; CHECK-NEXT: [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test11_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test11_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = shl nuw nsw <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F = shl <2 x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -453,18 +451,17 @@ define <2 x i64> @test12_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test12_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test12_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = lshr <2 x i128> [[C]], [[E]] -; CHECK-NEXT: [[G:%.*]] = trunc nuw nsw <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test12_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test12_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = lshr <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F 
= lshr <2 x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -518,18 +515,17 @@ define <2 x i64> @test13_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test13_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test13_vec_undef( -; CHECK-NEXT: [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = ashr <2 x i128> [[C]], [[E]] -; CHECK-NEXT: [[G:%.*]] = trunc nsw <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test13_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test13_vec_poison( +; CHECK-NEXT: [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = ashr <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = sext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F = ashr <2 x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -766,13 +762,13 @@ define <2 x i32> @trunc_shl_v2i32_v2i64_uniform(<2 x i64> %val) { ret <2 x i32> %trunc } -define <2 x i32> @trunc_shl_v2i32_v2i64_undef(<2 x i64> %val) { -; CHECK-LABEL: @trunc_shl_v2i32_v2i64_undef( +define <2 x i32> @trunc_shl_v2i32_v2i64_poison(<2 x i64> %val) { +; CHECK-LABEL: @trunc_shl_v2i32_v2i64_poison( ; CHECK-NEXT: [[VAL_TR:%.*]] = trunc <2 x i64> [[VAL:%.*]] to <2 x i32> -; CHECK-NEXT: [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], +; CHECK-NEXT: [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], ; CHECK-NEXT: ret <2 x i32> [[TRUNC]] ; - %shl = shl <2 x i64> %val, + %shl = shl <2 x i64> %val, %trunc = trunc <2 x i64> %shl to <2 x i32> ret <2 x i32> %trunc } @@ -917,7 +913,7 @@ define <4 x i8> @wide_shuf(<4 x i32> %x) { ret <4 x i8> %trunc } -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask define <4 x i8> @wide_splat1(<4 x i32> %x) { ; CHECK-LABEL: @wide_splat1( @@ -925,13 +921,13 @@ define <4 x i8> @wide_splat1(<4 x i32> %x) { ; CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i8> [[TRUNC]] ; - %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> + %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> %trunc = trunc <4 x i32> %shuf to <4 x i8> ret <4 x i8> %trunc } ; Test weird types. -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask define <3 x i31> @wide_splat2(<3 x i33> %x) { ; CHECK-LABEL: @wide_splat2( @@ -939,14 +935,14 @@ define <3 x i31> @wide_splat2(<3 x i33> %x) { ; CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <3 x i31> [[TMP1]], <3 x i31> poison, <3 x i32> ; CHECK-NEXT: ret <3 x i31> [[TRUNC]] ; - %shuf = shufflevector <3 x i33> %x, <3 x i33> undef, <3 x i32> + %shuf = shufflevector <3 x i33> %x, <3 x i33> poison, <3 x i32> %trunc = trunc <3 x i33> %shuf to <3 x i31> ret <3 x i31> %trunc } ; FIXME: -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask -; A mask with undef elements should still be considered a splat mask. +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask +; A mask with poison elements should still be considered a splat mask. 
define <3 x i31> @wide_splat3(<3 x i33> %x) { ; CHECK-LABEL: @wide_splat3( @@ -954,7 +950,7 @@ define <3 x i31> @wide_splat3(<3 x i33> %x) { ; CHECK-NEXT: [[TRUNC:%.*]] = trunc <3 x i33> [[SHUF]] to <3 x i31> ; CHECK-NEXT: ret <3 x i31> [[TRUNC]] ; - %shuf = shufflevector <3 x i33> %x, <3 x i33> undef, <3 x i32> + %shuf = shufflevector <3 x i33> %x, <3 x i33> poison, <3 x i32> %trunc = trunc <3 x i33> %shuf to <3 x i31> ret <3 x i31> %trunc } diff --git a/llvm/test/Transforms/InstCombine/unsigned-mul-lack-of-overflow-check-via-udiv-of-allones.ll b/llvm/test/Transforms/InstCombine/unsigned-mul-lack-of-overflow-check-via-udiv-of-allones.ll index 1ffcfb4424e313..241d9cbcde3382 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-mul-lack-of-overflow-check-via-udiv-of-allones.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-mul-lack-of-overflow-check-via-udiv-of-allones.ll @@ -30,14 +30,14 @@ define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i1> %r } -define <3 x i1> @t2_vec_undef(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @t2_vec_undef( +define <3 x i1> @t2_vec_poison(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @t2_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = call { <3 x i8>, <3 x i1> } @llvm.umul.with.overflow.v3i8(<3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) ; CHECK-NEXT: [[MUL_OV:%.*]] = extractvalue { <3 x i8>, <3 x i1> } [[MUL]], 1 ; CHECK-NEXT: [[MUL_NOT_OV:%.*]] = xor <3 x i1> [[MUL_OV]], ; CHECK-NEXT: ret <3 x i1> [[MUL_NOT_OV]] ; - %t0 = udiv <3 x i8> , %x + %t0 = udiv <3 x i8> , %x %r = icmp uge <3 x i8> %t0, %y ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/unsigned-mul-overflow-check-via-udiv-of-allones.ll b/llvm/test/Transforms/InstCombine/unsigned-mul-overflow-check-via-udiv-of-allones.ll index 710a09f6e16a17..7eb08bdd6016c9 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-mul-overflow-check-via-udiv-of-allones.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-mul-overflow-check-via-udiv-of-allones.ll @@ -28,13 +28,13 @@ define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i1> %r } -define <3 x i1> @t2_vec_undef(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @t2_vec_undef( +define <3 x i1> @t2_vec_poison(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @t2_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = call { <3 x i8>, <3 x i1> } @llvm.umul.with.overflow.v3i8(<3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) ; CHECK-NEXT: [[MUL_OV:%.*]] = extractvalue { <3 x i8>, <3 x i1> } [[MUL]], 1 ; CHECK-NEXT: ret <3 x i1> [[MUL_OV]] ; - %t0 = udiv <3 x i8> , %x + %t0 = udiv <3 x i8> , %x %r = icmp ult <3 x i8> %t0, %y ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll b/llvm/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll index adacf3ce99b2f9..262942aa1219b8 100644 --- a/llvm/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll +++ b/llvm/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll @@ -203,20 +203,20 @@ define <2 x i32> @t4_vec(<2 x i64> %data, <2 x i32> %nbits) { ret <2 x i32> %signextended } -define <3 x i32> @t5_vec_undef(<3 x i64> %data, <3 x i32> %nbits) { -; CHECK-LABEL: @t5_vec_undef( -; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <3 x i32> , [[NBITS:%.*]] +define <3 x i32> @t5_vec_poison(<3 x i64> %data, <3 x i32> %nbits) { +; CHECK-LABEL: @t5_vec_poison( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <3 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext nneg <3 x i32> [[SKIP_HIGH]] to <3 x i64> ; CHECK-NEXT: 
[[TMP1:%.*]] = ashr <3 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]] ; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc <3 x i64> [[TMP1]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[SIGNEXTENDED]] ; - %skip_high = sub <3 x i32> , %nbits + %skip_high = sub <3 x i32> , %nbits %skip_high_wide = zext <3 x i32> %skip_high to <3 x i64> %extracted = lshr <3 x i64> %data, %skip_high_wide %extracted_narrow = trunc <3 x i64> %extracted to <3 x i32> - %num_high_bits_to_smear_narrow0 = sub <3 x i32> , %nbits - %num_high_bits_to_smear_narrow1 = sub <3 x i32> , %nbits + %num_high_bits_to_smear_narrow0 = sub <3 x i32> , %nbits + %num_high_bits_to_smear_narrow1 = sub <3 x i32> , %nbits %signbit_positioned = shl <3 x i32> %extracted_narrow, %num_high_bits_to_smear_narrow0 %signextended = ashr <3 x i32> %signbit_positioned, %num_high_bits_to_smear_narrow1 ret <3 x i32> %signextended diff --git a/llvm/test/Transforms/InstCombine/vec_sext.ll b/llvm/test/Transforms/InstCombine/vec_sext.ll index a880d5e5627254..9f5f957f494452 100644 --- a/llvm/test/Transforms/InstCombine/vec_sext.ll +++ b/llvm/test/Transforms/InstCombine/vec_sext.ll @@ -42,24 +42,24 @@ define <4 x i32> @vec_select_alternate_sign_bit_test(<4 x i32> %a, <4 x i32> %b) ret <4 x i32> %cond } -define <2 x i32> @is_negative_undef_elt(<2 x i32> %a) { -; CHECK-LABEL: @is_negative_undef_elt( +define <2 x i32> @is_negative_poison_elt(<2 x i32> %a) { +; CHECK-LABEL: @is_negative_poison_elt( ; CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <2 x i32> [[A:%.*]], ; CHECK-NEXT: ret <2 x i32> [[A_LOBIT]] ; - %cmp = icmp slt <2 x i32> %a, + %cmp = icmp slt <2 x i32> %a, %sext = sext <2 x i1> %cmp to <2 x i32> ret <2 x i32> %sext } -define <2 x i32> @is_positive_undef_elt(<2 x i32> %a) { -; CHECK-LABEL: @is_positive_undef_elt( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i32> [[A:%.*]], +define <2 x i32> @is_positive_poison_elt(<2 x i32> %a) { +; CHECK-LABEL: @is_positive_poison_elt( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i32> [[A:%.*]], ; CHECK-NEXT: [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[SEXT]] ; - %cmp = icmp sgt <2 x i32> %a, + %cmp = icmp sgt <2 x i32> %a, %sext = sext <2 x i1> %cmp to <2 x i32> ret <2 x i32> %sext } diff --git a/llvm/test/Transforms/InstCombine/vector-casts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vector-casts-inseltpoison.ll index cf1b72fbcf3e1e..a87364600ba308 100644 --- a/llvm/test/Transforms/InstCombine/vector-casts-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/vector-casts-inseltpoison.ll @@ -26,26 +26,26 @@ define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) { ; This is trunc. -define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) { -; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt( +define <2 x i1> @and_cmp_is_trunc_even_with_poison_elt(<2 x i64> %a) { +; CHECK-LABEL: @and_cmp_is_trunc_even_with_poison_elt( ; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %t = and <2 x i64> %a, + %t = and <2 x i64> %a, %r = icmp ne <2 x i64> %t, zeroinitializer ret <2 x i1> %r } -; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete. +; TODO: This could be just 1 instruction (trunc), but our poison matching is incomplete. 
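; (Editor's sketch, hypothetical lane values.) Even with poison lanes in both
; constants, e.g.
;   %t = and <2 x i64> %a, <i64 1, i64 poison>
;   %r = icmp ne <2 x i64> %t, <i64 0, i64 poison>
; folding to `%r = trunc <2 x i64> %a to <2 x i1>` would be sound: the poison
; lane of %r may be refined to the truncated bit, and the other lane is exactly
; a low-bit test. The current matcher gives up here, as the next test shows: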
-define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) { -; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts( -; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], -; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], +define <2 x i1> @and_cmp_is_trunc_even_with_poison_elts(<2 x i64> %a) { +; CHECK-LABEL: @and_cmp_is_trunc_even_with_poison_elts( +; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %t = and <2 x i64> %a, - %r = icmp ne <2 x i64> %t, + %t = and <2 x i64> %a, + %r = icmp ne <2 x i64> %t, ret <2 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/vector-casts.ll b/llvm/test/Transforms/InstCombine/vector-casts.ll index 281fc5f6011ea7..fd2a4ffdfb7092 100644 --- a/llvm/test/Transforms/InstCombine/vector-casts.ll +++ b/llvm/test/Transforms/InstCombine/vector-casts.ll @@ -26,26 +26,26 @@ define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) { ; This is trunc. -define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) { -; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt( +define <2 x i1> @and_cmp_is_trunc_even_with_poison_elt(<2 x i64> %a) { +; CHECK-LABEL: @and_cmp_is_trunc_even_with_poison_elt( ; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %t = and <2 x i64> %a, + %t = and <2 x i64> %a, %r = icmp ne <2 x i64> %t, zeroinitializer ret <2 x i1> %r } -; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete. +; TODO: This could be just 1 instruction (trunc), but our poison matching is incomplete. -define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) { -; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts( -; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], -; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], +define <2 x i1> @and_cmp_is_trunc_even_with_poison_elts(<2 x i64> %a) { +; CHECK-LABEL: @and_cmp_is_trunc_even_with_poison_elts( +; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %t = and <2 x i64> %a, - %r = icmp ne <2 x i64> %t, + %t = and <2 x i64> %a, + %r = icmp ne <2 x i64> %t, ret <2 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/vector-urem.ll b/llvm/test/Transforms/InstCombine/vector-urem.ll index d5c77470a20f8e..627789a03ef6ca 100644 --- a/llvm/test/Transforms/InstCombine/vector-urem.ll +++ b/llvm/test/Transforms/InstCombine/vector-urem.ll @@ -19,11 +19,11 @@ define <4 x i32> @test_v4i32_const_pow2(<4 x i32> %a0) { ret <4 x i32> %1 } -define <4 x i32> @test_v4i32_const_pow2_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_const_pow2_undef( +define <4 x i32> @test_v4i32_const_pow2_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_const_pow2_poison( ; CHECK-NEXT: ret <4 x i32> poison ; - %1 = urem <4 x i32> %a0, + %1 = urem <4 x i32> %a0, ret <4 x i32> %1 } @@ -37,13 +37,13 @@ define <4 x i32> @test_v4i32_one(<4 x i32> %a0) { ret <4 x i32> %1 } -define <4 x i32> @test_v4i32_one_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_one_undef( +define <4 x i32> @test_v4i32_one_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_one_poison( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[A0:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; - %1 = urem <4 x i32> , %a0 + %1 = urem <4 x i32> , %a0 ret <4 x i32> %1 } @@ -71,10 +71,10 @@ define <4 x i32> @test_v4i32_negconst(<4 x i32> %a0) { ret <4 x i32> %1 
} -define <4 x i32> @test_v4i32_negconst_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_negconst_undef( +define <4 x i32> @test_v4i32_negconst_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_negconst_poison( ; CHECK-NEXT: ret <4 x i32> poison ; - %1 = urem <4 x i32> %a0, + %1 = urem <4 x i32> %a0, ret <4 x i32> %1 } diff --git a/llvm/test/Transforms/InstCombine/vector-xor.ll b/llvm/test/Transforms/InstCombine/vector-xor.ll index 171dd6e35b4e11..ee593b5d15e8e3 100644 --- a/llvm/test/Transforms/InstCombine/vector-xor.ll +++ b/llvm/test/Transforms/InstCombine/vector-xor.ll @@ -53,14 +53,14 @@ define <4 x i32> @test_v4i32_xor_bswap_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_xor_bswap_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_xor_bswap_const_undef( +define <4 x i32> @test_v4i32_xor_bswap_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_xor_bswap_const_poison( ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[A0:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; %1 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a0) - %2 = xor <4 x i32> %1, + %2 = xor <4 x i32> %1, ret <4 x i32> %2 } @@ -105,14 +105,14 @@ define <4 x i32> @test_v4i32_not_ashr_not(<4 x i32> %x, <4 x i32> %y) { ret <4 x i32> %3 } -define <4 x i32> @test_v4i32_not_ashr_not_undef(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @test_v4i32_not_ashr_not_undef( +define <4 x i32> @test_v4i32_not_ashr_not_poison(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @test_v4i32_not_ashr_not_poison( ; CHECK-NEXT: [[DOTNOT:%.*]] = ashr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <4 x i32> [[DOTNOT]] ; - %1 = xor <4 x i32> , %x + %1 = xor <4 x i32> , %x %2 = ashr <4 x i32> %1, %y - %3 = xor <4 x i32> , %2 + %3 = xor <4 x i32> , %2 ret <4 x i32> %3 } @@ -138,13 +138,13 @@ define <4 x i32> @test_v4i32_not_ashr_negative_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_not_ashr_negative_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_not_ashr_negative_const_undef( +define <4 x i32> @test_v4i32_not_ashr_negative_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_not_ashr_negative_const_poison( ; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> , [[A0:%.*]] ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; - %1 = ashr <4 x i32> , %a0 - %2 = xor <4 x i32> , %1 + %1 = ashr <4 x i32> , %a0 + %2 = xor <4 x i32> , %1 ret <4 x i32> %2 } @@ -170,13 +170,13 @@ define <4 x i32> @test_v4i32_not_lshr_nonnegative_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_not_lshr_nonnegative_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_not_lshr_nonnegative_const_undef( +define <4 x i32> @test_v4i32_not_lshr_nonnegative_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_not_lshr_nonnegative_const_poison( ; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i32> , [[A0:%.*]] ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; - %1 = lshr <4 x i32> , %a0 - %2 = xor <4 x i32> , %1 + %1 = lshr <4 x i32> , %a0 + %2 = xor <4 x i32> , %1 ret <4 x i32> %2 } @@ -202,13 +202,13 @@ define <4 x i32> @test_v4i32_not_sub_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_not_sub_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_not_sub_const_undef( -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A0:%.*]], +define <4 x i32> @test_v4i32_not_sub_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_not_sub_const_poison( +; CHECK-NEXT: 
[[TMP1:%.*]] = add <4 x i32> [[A0:%.*]], ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; - %1 = sub <4 x i32> , %a0 - %2 = xor <4 x i32> , %1 + %1 = sub <4 x i32> , %a0 + %2 = xor <4 x i32> , %1 ret <4 x i32> %2 } @@ -235,14 +235,14 @@ define <4 x i32> @test_v4i32_xor_signmask_sub_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_xor_signmask_sub_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_xor_signmask_sub_const_undef( -; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> , [[A0:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], +define <4 x i32> @test_v4i32_xor_signmask_sub_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_xor_signmask_sub_const_poison( +; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> , [[A0:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; - %1 = sub <4 x i32> , %a0 - %2 = xor <4 x i32> , %1 + %1 = sub <4 x i32> , %a0 + %2 = xor <4 x i32> , %1 ret <4 x i32> %2 } @@ -269,13 +269,13 @@ define <4 x i32> @test_v4i32_xor_signmask_add_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_xor_signmask_add_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_xor_signmask_add_const_undef( -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A0:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], +define <4 x i32> @test_v4i32_xor_signmask_add_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_xor_signmask_add_const_poison( +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A0:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; - %1 = add <4 x i32> , %a0 - %2 = xor <4 x i32> , %1 + %1 = add <4 x i32> , %a0 + %2 = xor <4 x i32> , %1 ret <4 x i32> %2 } diff --git a/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll b/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll index 7fed952a7ff7e8..12739b5686a0ad 100644 --- a/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll @@ -126,13 +126,13 @@ define <2 x i64> @zext_negate_vec(<2 x i1> %A) { ret <2 x i64> %sub } -define <2 x i64> @zext_negate_vec_undef_elt(<2 x i1> %A) { -; CHECK-LABEL: @zext_negate_vec_undef_elt( +define <2 x i64> @zext_negate_vec_poison_elt(<2 x i1> %A) { +; CHECK-LABEL: @zext_negate_vec_poison_elt( ; CHECK-NEXT: [[EXT_NEG:%.*]] = sext <2 x i1> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[EXT_NEG]] ; %ext = zext <2 x i1> %A to <2 x i64> - %sub = sub <2 x i64> , %ext + %sub = sub <2 x i64> , %ext ret <2 x i64> %sub } @@ -169,13 +169,13 @@ define <2 x i64> @zext_sub_const_vec(<2 x i1> %A) { ret <2 x i64> %sub } -define <2 x i64> @zext_sub_const_vec_undef_elt(<2 x i1> %A) { -; CHECK-LABEL: @zext_sub_const_vec_undef_elt( -; CHECK-NEXT: [[SUB:%.*]] = select <2 x i1> [[A:%.*]], <2 x i64> , <2 x i64> +define <2 x i64> @zext_sub_const_vec_poison_elt(<2 x i1> %A) { +; CHECK-LABEL: @zext_sub_const_vec_poison_elt( +; CHECK-NEXT: [[SUB:%.*]] = select <2 x i1> [[A:%.*]], <2 x i64> , <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[SUB]] ; %ext = zext <2 x i1> %A to <2 x i64> - %sub = sub <2 x i64> , %ext + %sub = sub <2 x i64> , %ext ret <2 x i64> %sub } @@ -212,13 +212,13 @@ define <2 x i64> @sext_negate_vec(<2 x i1> %A) { ret <2 x i64> %sub } -define <2 x i64> @sext_negate_vec_undef_elt(<2 x i1> %A) { -; CHECK-LABEL: @sext_negate_vec_undef_elt( +define <2 x i64> @sext_negate_vec_poison_elt(<2 x i1> %A) { +; CHECK-LABEL: @sext_negate_vec_poison_elt( ; CHECK-NEXT: [[EXT_NEG:%.*]] = zext <2 x i1> [[A:%.*]] to <2 x i64> ; 
CHECK-NEXT: ret <2 x i64> [[EXT_NEG]] ; %ext = sext <2 x i1> %A to <2 x i64> - %sub = sub <2 x i64> , %ext + %sub = sub <2 x i64> , %ext ret <2 x i64> %sub } @@ -255,13 +255,13 @@ define <2 x i64> @sext_sub_const_vec(<2 x i1> %A) { ret <2 x i64> %sub } -define <2 x i64> @sext_sub_const_vec_undef_elt(<2 x i1> %A) { -; CHECK-LABEL: @sext_sub_const_vec_undef_elt( -; CHECK-NEXT: [[SUB:%.*]] = select <2 x i1> [[A:%.*]], <2 x i64> , <2 x i64> +define <2 x i64> @sext_sub_const_vec_poison_elt(<2 x i1> %A) { +; CHECK-LABEL: @sext_sub_const_vec_poison_elt( +; CHECK-NEXT: [[SUB:%.*]] = select <2 x i1> [[A:%.*]], <2 x i64> , <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[SUB]] ; %ext = sext <2 x i1> %A to <2 x i64> - %sub = sub <2 x i64> , %ext + %sub = sub <2 x i64> , %ext ret <2 x i64> %sub } diff --git a/llvm/test/Transforms/InstSimplify/AndOrXor.ll b/llvm/test/Transforms/InstSimplify/AndOrXor.ll index 494b6bcd2b66d5..2e3a6052242038 100644 --- a/llvm/test/Transforms/InstSimplify/AndOrXor.ll +++ b/llvm/test/Transforms/InstSimplify/AndOrXor.ll @@ -12,11 +12,11 @@ define i8 @and0(i8 %x) { ret i8 %r } -define <2 x i8> @and0_vec_undef_elt(<2 x i8> %x) { -; CHECK-LABEL: @and0_vec_undef_elt( +define <2 x i8> @and0_vec_poison_elt(<2 x i8> %x) { +; CHECK-LABEL: @and0_vec_poison_elt( ; CHECK-NEXT: ret <2 x i8> zeroinitializer ; - %r = and <2 x i8> %x, + %r = and <2 x i8> %x, ret <2 x i8> %r } @@ -31,14 +31,14 @@ define <2 x i32> @add_nsw_signbit(<2 x i32> %x) { ret <2 x i32> %z } -; Undef elements in either constant vector are ok. +; Poison elements in either constant vector are ok. -define <2 x i32> @add_nsw_signbit_undef(<2 x i32> %x) { -; CHECK-LABEL: @add_nsw_signbit_undef( +define <2 x i32> @add_nsw_signbit_poison(<2 x i32> %x) { +; CHECK-LABEL: @add_nsw_signbit_poison( ; CHECK-NEXT: ret <2 x i32> [[X:%.*]] ; - %y = xor <2 x i32> %x, - %z = add nsw <2 x i32> %y, + %y = xor <2 x i32> %x, + %z = add nsw <2 x i32> %y, ret <2 x i32> %z } @@ -53,14 +53,14 @@ define <2 x i5> @add_nuw_signbit(<2 x i5> %x) { ret <2 x i5> %z } -; Undef elements in either constant vector are ok. +; Poison elements in either constant vector are ok. -define <2 x i5> @add_nuw_signbit_undef(<2 x i5> %x) { -; CHECK-LABEL: @add_nuw_signbit_undef( +define <2 x i5> @add_nuw_signbit_poison(<2 x i5> %x) { +; CHECK-LABEL: @add_nuw_signbit_poison( ; CHECK-NEXT: ret <2 x i5> [[X:%.*]] ; - %y = xor <2 x i5> %x, - %z = add nuw <2 x i5> %y, + %y = xor <2 x i5> %x, + %z = add nuw <2 x i5> %y, ret <2 x i5> %z } @@ -584,7 +584,7 @@ define <2 x i32> @or_xor_andn_commute2(<2 x i32> %a, <2 x i32> %b) { ; CHECK-NEXT: ret <2 x i32> [[XOR]] ; %xor = xor <2 x i32> %a, %b - %neg = xor <2 x i32> %b, + %neg = xor <2 x i32> %b, %and = and <2 x i32> %a, %neg %or = or <2 x i32> %xor, %and ret <2 x i32> %or @@ -708,15 +708,13 @@ define <2 x i32> @or_xorn_and_commute2_undef(<2 x i32> %a, <2 x i32> %b) { ret <2 x i32> %or } -; TODO: Unlike the above test, this is safe to fold. +; Unlike the above test, this is safe to fold. 
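; With an undef lane in the xor constant, the or below still guarantees the
; bits of (B & A) in that lane, which a bare xor does not, so the fold is
; blocked. With poison, that lane of the xor -- and therefore of the or -- is
; itself poison, so replacing the or with the xor is a legal refinement, as
; the CHECK lines below verify.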
define <2 x i32> @or_xorn_and_commute2_poison(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: @or_xorn_and_commute2_poison( ; CHECK-NEXT: [[NEGA:%.*]] = xor <2 x i32> [[A:%.*]], -; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[B:%.*]], [[A]] -; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i32> [[B]], [[NEGA]] -; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[XOR]], [[AND]] -; CHECK-NEXT: ret <2 x i32> [[OR]] +; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i32> [[B:%.*]], [[NEGA]] +; CHECK-NEXT: ret <2 x i32> [[XOR]] ; %nega = xor <2 x i32> %a, %and = and <2 x i32> %b, %a diff --git a/llvm/test/Transforms/InstSimplify/call.ll b/llvm/test/Transforms/InstSimplify/call.ll index 52c207a2760468..c6f6b65f89dc2b 100644 --- a/llvm/test/Transforms/InstSimplify/call.ll +++ b/llvm/test/Transforms/InstSimplify/call.ll @@ -976,7 +976,7 @@ define <2 x i8> @fshr_zero_vec(<2 x i8> %shamt) { ; CHECK-LABEL: @fshr_zero_vec( ; CHECK-NEXT: ret <2 x i8> zeroinitializer ; - %r = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> zeroinitializer, <2 x i8> , <2 x i8> %shamt) + %r = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> zeroinitializer, <2 x i8> , <2 x i8> %shamt) ret <2 x i8> %r } @@ -984,7 +984,7 @@ define <2 x i7> @fshl_ones_vec(<2 x i7> %shamt) { ; CHECK-LABEL: @fshl_ones_vec( ; CHECK-NEXT: ret <2 x i7> ; - %r = call <2 x i7> @llvm.fshl.v2i7(<2 x i7> , <2 x i7> , <2 x i7> %shamt) + %r = call <2 x i7> @llvm.fshl.v2i7(<2 x i7> , <2 x i7> , <2 x i7> %shamt) ret <2 x i7> %r } @@ -1466,7 +1466,7 @@ define <3 x i33> @cttz_shl1_vec(<3 x i33> %x) { ; CHECK-LABEL: @cttz_shl1_vec( ; CHECK-NEXT: ret <3 x i33> [[X:%.*]] ; - %s = shl <3 x i33> , %x + %s = shl <3 x i33> , %x %r = call <3 x i33> @llvm.cttz.v3i33(<3 x i33> %s, i1 false) ret <3 x i33> %r } @@ -1509,7 +1509,7 @@ define <3 x i33> @ctlz_lshr_sign_bit_vec(<3 x i33> %x) { ; CHECK-LABEL: @ctlz_lshr_sign_bit_vec( ; CHECK-NEXT: ret <3 x i33> [[X:%.*]] ; - %s = lshr <3 x i33> , %x + %s = lshr <3 x i33> , %x %r = call <3 x i33> @llvm.ctlz.v3i33(<3 x i33> %s, i1 false) ret <3 x i33> %r } @@ -1549,7 +1549,7 @@ define <3 x i33> @ctlz_ashr_sign_bit_vec(<3 x i33> %x) { ; CHECK-LABEL: @ctlz_ashr_sign_bit_vec( ; CHECK-NEXT: ret <3 x i33> zeroinitializer ; - %s = ashr <3 x i33> , %x + %s = ashr <3 x i33> , %x %r = call <3 x i33> @llvm.ctlz.v3i33(<3 x i33> %s, i1 true) ret <3 x i33> %r } diff --git a/llvm/test/Transforms/InstSimplify/compare.ll b/llvm/test/Transforms/InstSimplify/compare.ll index 1e90f0edbd8003..724912d90bd861 100644 --- a/llvm/test/Transforms/InstSimplify/compare.ll +++ b/llvm/test/Transforms/InstSimplify/compare.ll @@ -1659,21 +1659,21 @@ define <2 x i1> @icmp_shl_1_ugt_signmask(<2 x i8> %V) { ret <2 x i1> %cmp } -define <2 x i1> @icmp_shl_1_ugt_signmask_undef(<2 x i8> %V) { -; CHECK-LABEL: @icmp_shl_1_ugt_signmask_undef( +define <2 x i1> @icmp_shl_1_ugt_signmask_poison(<2 x i8> %V) { +; CHECK-LABEL: @icmp_shl_1_ugt_signmask_poison( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %shl = shl <2 x i8> , %V - %cmp = icmp ugt <2 x i8> %shl, + %cmp = icmp ugt <2 x i8> %shl, ret <2 x i1> %cmp } -define <2 x i1> @icmp_shl_1_ugt_signmask_undef2(<2 x i8> %V) { -; CHECK-LABEL: @icmp_shl_1_ugt_signmask_undef2( +define <2 x i1> @icmp_shl_1_ugt_signmask_poison2(<2 x i8> %V) { +; CHECK-LABEL: @icmp_shl_1_ugt_signmask_poison2( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; - %shl = shl <2 x i8> , %V - %cmp = icmp ugt <2 x i8> %shl, + %shl = shl <2 x i8> , %V + %cmp = icmp ugt <2 x i8> %shl, ret <2 x i1> %cmp } @@ -1695,21 +1695,21 @@ define <2 x i1> @icmp_shl_1_ule_signmask(<2 x i8> %V) { ret <2 x i1> %cmp } -define <2 x i1> 
@icmp_shl_1_ule_signmask_undef(<2 x i8> %V) { -; CHECK-LABEL: @icmp_shl_1_ule_signmask_undef( +define <2 x i1> @icmp_shl_1_ule_signmask_poison(<2 x i8> %V) { +; CHECK-LABEL: @icmp_shl_1_ule_signmask_poison( ; CHECK-NEXT: ret <2 x i1> ; %shl = shl <2 x i8> , %V - %cmp = icmp ule <2 x i8> %shl, + %cmp = icmp ule <2 x i8> %shl, ret <2 x i1> %cmp } -define <2 x i1> @icmp_shl_1_ule_signmask_undef2(<2 x i8> %V) { -; CHECK-LABEL: @icmp_shl_1_ule_signmask_undef2( +define <2 x i1> @icmp_shl_1_ule_signmask_poison2(<2 x i8> %V) { +; CHECK-LABEL: @icmp_shl_1_ule_signmask_poison2( ; CHECK-NEXT: ret <2 x i1> ; - %shl = shl <2 x i8> , %V - %cmp = icmp ule <2 x i8> %shl, + %shl = shl <2 x i8> , %V + %cmp = icmp ule <2 x i8> %shl, ret <2 x i1> %cmp } @@ -1731,12 +1731,12 @@ define <2 x i1> @shl_1_cmp_eq_nonpow2_splat(<2 x i32> %x) { ret <2 x i1> %c } -define <2 x i1> @shl_1_cmp_eq_nonpow2_splat_undef(<2 x i32> %x) { -; CHECK-LABEL: @shl_1_cmp_eq_nonpow2_splat_undef( +define <2 x i1> @shl_1_cmp_eq_nonpow2_splat_poison(<2 x i32> %x) { +; CHECK-LABEL: @shl_1_cmp_eq_nonpow2_splat_poison( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %s = shl <2 x i32> , %x - %c = icmp eq <2 x i32> %s, + %c = icmp eq <2 x i32> %s, ret <2 x i1> %c } @@ -1758,12 +1758,12 @@ define <2 x i1> @shl_1_cmp_ne_nonpow2_splat(<2 x i32> %x) { ret <2 x i1> %c } -define <2 x i1> @shl_1_cmp_ne_nonpow2_splat_undef(<2 x i32> %x) { -; CHECK-LABEL: @shl_1_cmp_ne_nonpow2_splat_undef( +define <2 x i1> @shl_1_cmp_ne_nonpow2_splat_poison(<2 x i32> %x) { +; CHECK-LABEL: @shl_1_cmp_ne_nonpow2_splat_poison( ; CHECK-NEXT: ret <2 x i1> ; - %s = shl <2 x i32> , %x - %c = icmp ne <2 x i32> %s, + %s = shl <2 x i32> , %x + %c = icmp ne <2 x i32> %s, ret <2 x i1> %c } @@ -1776,12 +1776,12 @@ define i1 @shl_pow2_cmp_eq_nonpow2(i32 %x) { ret i1 %c } -define <2 x i1> @shl_pow21_cmp_ne_nonpow2_splat_undef(<2 x i32> %x) { -; CHECK-LABEL: @shl_pow21_cmp_ne_nonpow2_splat_undef( +define <2 x i1> @shl_pow21_cmp_ne_nonpow2_splat_poison(<2 x i32> %x) { +; CHECK-LABEL: @shl_pow21_cmp_ne_nonpow2_splat_poison( ; CHECK-NEXT: ret <2 x i1> ; - %s = shl <2 x i32> , %x - %c = icmp ne <2 x i32> %s, + %s = shl <2 x i32> , %x + %c = icmp ne <2 x i32> %s, ret <2 x i1> %c } @@ -1820,12 +1820,12 @@ define i1 @shl_pow2_cmp_eq_zero_nuw(i32 %x) { ret i1 %c } -define <2 x i1> @shl_pow2_cmp_ne_zero_nuw_splat_undef(<2 x i32> %x) { -; CHECK-LABEL: @shl_pow2_cmp_ne_zero_nuw_splat_undef( +define <2 x i1> @shl_pow2_cmp_ne_zero_nuw_splat_poison(<2 x i32> %x) { +; CHECK-LABEL: @shl_pow2_cmp_ne_zero_nuw_splat_poison( ; CHECK-NEXT: ret <2 x i1> ; - %s = shl nuw <2 x i32> , %x - %c = icmp ne <2 x i32> %s, + %s = shl nuw <2 x i32> , %x + %c = icmp ne <2 x i32> %s, ret <2 x i1> %c } @@ -1838,12 +1838,12 @@ define i1 @shl_pow2_cmp_ne_zero_nsw(i32 %x) { ret i1 %c } -define <2 x i1> @shl_pow2_cmp_eq_zero_nsw_splat_undef(<2 x i32> %x) { -; CHECK-LABEL: @shl_pow2_cmp_eq_zero_nsw_splat_undef( +define <2 x i1> @shl_pow2_cmp_eq_zero_nsw_splat_poison(<2 x i32> %x) { +; CHECK-LABEL: @shl_pow2_cmp_eq_zero_nsw_splat_poison( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; - %s = shl nsw <2 x i32> , %x - %c = icmp eq <2 x i32> %s, + %s = shl nsw <2 x i32> , %x + %c = icmp eq <2 x i32> %s, ret <2 x i1> %c } diff --git a/llvm/test/Transforms/InstSimplify/constantfold-add-nuw-allones-to-allones.ll b/llvm/test/Transforms/InstSimplify/constantfold-add-nuw-allones-to-allones.ll index 7c9d9a9e2c7ce7..92d6cc30d6248e 100644 --- a/llvm/test/Transforms/InstSimplify/constantfold-add-nuw-allones-to-allones.ll +++ 
b/llvm/test/Transforms/InstSimplify/constantfold-add-nuw-allones-to-allones.ll @@ -63,11 +63,11 @@ define <2 x i8> @add_vec(<2 x i8> %x) { ret <2 x i8> %ret } -define <3 x i8> @add_vec_undef(<3 x i8> %x) { -; CHECK-LABEL: @add_vec_undef( -; CHECK-NEXT: ret <3 x i8> +define <3 x i8> @add_vec_poison(<3 x i8> %x) { +; CHECK-LABEL: @add_vec_poison( +; CHECK-NEXT: ret <3 x i8> ; - %ret = add nuw <3 x i8> %x, + %ret = add nuw <3 x i8> %x, ret <3 x i8> %ret } diff --git a/llvm/test/Transforms/InstSimplify/constantfold-shl-nuw-C-to-C.ll b/llvm/test/Transforms/InstSimplify/constantfold-shl-nuw-C-to-C.ll index b5b5773fee538e..3f4a08807a4b41 100644 --- a/llvm/test/Transforms/InstSimplify/constantfold-shl-nuw-C-to-C.ll +++ b/llvm/test/Transforms/InstSimplify/constantfold-shl-nuw-C-to-C.ll @@ -78,11 +78,11 @@ define <2 x i8> @shl_vec(<2 x i8> %x) { ret <2 x i8> %ret } -define <3 x i8> @shl_vec_undef(<3 x i8> %x) { -; CHECK-LABEL: @shl_vec_undef( -; CHECK-NEXT: ret <3 x i8> +define <3 x i8> @shl_vec_poison(<3 x i8> %x) { +; CHECK-LABEL: @shl_vec_poison( +; CHECK-NEXT: ret <3 x i8> ; - %ret = shl nuw <3 x i8> , %x + %ret = shl nuw <3 x i8> , %x ret <3 x i8> %ret } diff --git a/llvm/test/Transforms/InstSimplify/div.ll b/llvm/test/Transforms/InstSimplify/div.ll index e13b6f139bcf53..5ca2e8837b924b 100644 --- a/llvm/test/Transforms/InstSimplify/div.ll +++ b/llvm/test/Transforms/InstSimplify/div.ll @@ -17,11 +17,11 @@ define <2 x i32> @zero_dividend_vector(<2 x i32> %A) { ret <2 x i32> %B } -define <2 x i32> @zero_dividend_vector_undef_elt(<2 x i32> %A) { -; CHECK-LABEL: @zero_dividend_vector_undef_elt( +define <2 x i32> @zero_dividend_vector_poison_elt(<2 x i32> %A) { +; CHECK-LABEL: @zero_dividend_vector_poison_elt( ; CHECK-NEXT: ret <2 x i32> zeroinitializer ; - %B = sdiv <2 x i32> , %A + %B = sdiv <2 x i32> , %A ret <2 x i32> %B } @@ -59,23 +59,23 @@ define <2 x i8> @udiv_zero_elt_vec(<2 x i8> %x) { ret <2 x i8> %div } -define <2 x i8> @sdiv_undef_elt_vec(<2 x i8> %x) { -; CHECK-LABEL: @sdiv_undef_elt_vec( +define <2 x i8> @sdiv_poison_elt_vec(<2 x i8> %x) { +; CHECK-LABEL: @sdiv_poison_elt_vec( ; CHECK-NEXT: ret <2 x i8> poison ; - %div = sdiv <2 x i8> %x, + %div = sdiv <2 x i8> %x, ret <2 x i8> %div } -define <2 x i8> @udiv_undef_elt_vec(<2 x i8> %x) { -; CHECK-LABEL: @udiv_undef_elt_vec( +define <2 x i8> @udiv_poison_elt_vec(<2 x i8> %x) { +; CHECK-LABEL: @udiv_poison_elt_vec( ; CHECK-NEXT: ret <2 x i8> poison ; - %div = udiv <2 x i8> %x, + %div = udiv <2 x i8> %x, ret <2 x i8> %div } -; Division-by-zero is undef. UB in any vector lane means the whole op is undef. +; Division-by-zero is poison. UB in any vector lane means the whole op is poison. ; Thus, we can simplify this: if any element of 'y' is 0, we can do anything. ; Therefore, assume that all elements of 'y' must be 1. 
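; A minimal sketch of that reasoning, with a hypothetical divisor constant
; (not part of this patch): lane 1 divides by zero, so the instruction is
; unconditionally immediate UB and the simplifier may assume every divisor
; element is 1, folding the whole divide to %x.
define <2 x i8> @udiv_zero_lane_sketch(<2 x i8> %x) {
  %div = udiv <2 x i8> %x, <i8 1, i8 0>   ; lane 1 is UB; assume divisor is <i8 1, i8 1>
  ret <2 x i8> %div                       ; so this may fold to:  ret <2 x i8> %x
}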
diff --git a/llvm/test/Transforms/InstSimplify/fast-math-strictfp.ll b/llvm/test/Transforms/InstSimplify/fast-math-strictfp.ll index 4938987baccc24..b1d772890aff83 100644 --- a/llvm/test/Transforms/InstSimplify/fast-math-strictfp.ll +++ b/llvm/test/Transforms/InstSimplify/fast-math-strictfp.ll @@ -18,11 +18,11 @@ define float @mul_zero_2(float %a) #0 { ret float %b } -define <2 x float> @mul_zero_nsz_nnan_vec_undef(<2 x float> %a) #0 { -; CHECK-LABEL: @mul_zero_nsz_nnan_vec_undef( +define <2 x float> @mul_zero_nsz_nnan_vec_poison(<2 x float> %a) #0 { +; CHECK-LABEL: @mul_zero_nsz_nnan_vec_poison( ; CHECK-NEXT: ret <2 x float> zeroinitializer ; - %b = call nsz nnan <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %a, <2 x float>, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %b = call nsz nnan <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %a, <2 x float>, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %b } @@ -98,13 +98,13 @@ define <2 x float> @fadd_unary_fnegx_commute_vec(<2 x float> %x) #0 { ret <2 x float> %r } -define <2 x float> @fadd_fnegx_commute_vec_undef(<2 x float> %x) #0 { -; CHECK-LABEL: @fadd_fnegx_commute_vec_undef( -; CHECK-NEXT: [[NEGX:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[X:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fadd_fnegx_commute_vec_poison(<2 x float> %x) #0 { +; CHECK-LABEL: @fadd_fnegx_commute_vec_poison( +; CHECK-NEXT: [[NEGX:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[X:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: [[R:%.*]] = call nnan <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[X]], <2 x float> [[NEGX]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[R]] ; - %negx = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %negx = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") %r = call nnan <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %negx, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %r } @@ -240,34 +240,34 @@ define float @fneg_x(float %a) #0 { ret float %ret } -define <2 x float> @fsub_0_0_x_vec_undef1(<2 x float> %a) #0 { -; CHECK-LABEL: @fsub_0_0_x_vec_undef1( -; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fsub_0_0_x_vec_poison1(<2 x float> %a) #0 { +; CHECK-LABEL: @fsub_0_0_x_vec_poison1( +; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: [[RET:%.*]] = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> zeroinitializer, <2 x float> [[T1]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x 
float> , <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") %ret = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> zeroinitializer, <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } -define <2 x float> @fneg_x_vec_undef1(<2 x float> %a) #0 { -; CHECK-LABEL: @fneg_x_vec_undef1( +define <2 x float> @fneg_x_vec_poison1(<2 x float> %a) #0 { +; CHECK-LABEL: @fneg_x_vec_poison1( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; %t1 = fneg <2 x float> %a - %ret = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %ret = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } -define <2 x float> @fsub_0_0_x_vec_undef2(<2 x float> %a) #0 { -; CHECK-LABEL: @fsub_0_0_x_vec_undef2( +define <2 x float> @fsub_0_0_x_vec_poison2(<2 x float> %a) #0 { +; CHECK-LABEL: @fsub_0_0_x_vec_poison2( ; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> zeroinitializer, <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") -; CHECK-NEXT: [[RET:%.*]] = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[T1]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +; CHECK-NEXT: [[RET:%.*]] = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[T1]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[RET]] ; %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> zeroinitializer, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") - %ret = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %ret = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } @@ -281,11 +281,11 @@ define <2 x float> @fadd_zero_nsz_vec(<2 x float> %x) #0 { ret <2 x float> %r } -define <2 x float> @fadd_zero_nsz_vec_undef(<2 x float> %x) #0 { -; CHECK-LABEL: @fadd_zero_nsz_vec_undef( +define <2 x float> @fadd_zero_nsz_vec_poison(<2 x float> %x) #0 { +; CHECK-LABEL: @fadd_zero_nsz_vec_poison( ; CHECK-NEXT: ret <2 x float> [[X:%.*]] ; - %r = call nsz <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call nsz <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %r } @@ -375,11 +375,11 @@ define double @fdiv_zero_by_x(double %x) #0 { ret double %r } -define <2 x double> @fdiv_zero_by_x_vec_undef(<2 x double> %x) #0 { -; CHECK-LABEL: @fdiv_zero_by_x_vec_undef( +define <2 x double> @fdiv_zero_by_x_vec_poison(<2 x double> %x) #0 { +; CHECK-LABEL: @fdiv_zero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> zeroinitializer ; - %r = call nnan nsz <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> , <2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call nnan nsz <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> , <2 x double> %x, metadata 
!"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %r } @@ -394,11 +394,11 @@ define double @frem_zero_by_x(double %x) #0 { ret double %r } -define <2 x double> @frem_poszero_by_x_vec_undef(<2 x double> %x) #0 { -; CHECK-LABEL: @frem_poszero_by_x_vec_undef( +define <2 x double> @frem_poszero_by_x_vec_poison(<2 x double> %x) #0 { +; CHECK-LABEL: @frem_poszero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> zeroinitializer ; - %r = call nnan <2 x double> @llvm.experimental.constrained.frem.v2f64(<2 x double> , <2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call nnan <2 x double> @llvm.experimental.constrained.frem.v2f64(<2 x double> , <2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %r } @@ -413,11 +413,11 @@ define double @frem_negzero_by_x(double %x) #0 { ret double %r } -define <2 x double> @frem_negzero_by_x_vec_undef(<2 x double> %x) #0 { -; CHECK-LABEL: @frem_negzero_by_x_vec_undef( +define <2 x double> @frem_negzero_by_x_vec_poison(<2 x double> %x) #0 { +; CHECK-LABEL: @frem_negzero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> ; - %r = call nnan <2 x double> @llvm.experimental.constrained.frem.v2f64(<2 x double> , <2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call nnan <2 x double> @llvm.experimental.constrained.frem.v2f64(<2 x double> , <2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %r } @@ -493,13 +493,13 @@ define float @fdiv_neg_swapped2(float %f) #0 { ret float %div } -define <2 x float> @fdiv_neg_vec_undef_elt(<2 x float> %f) #0 { -; CHECK-LABEL: @fdiv_neg_vec_undef_elt( -; CHECK-NEXT: [[NEG:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[F:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fdiv_neg_vec_poison_elt(<2 x float> %f) #0 { +; CHECK-LABEL: @fdiv_neg_vec_poison_elt( +; CHECK-NEXT: [[NEG:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[F:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: [[DIV:%.*]] = call nnan <2 x float> @llvm.experimental.constrained.fdiv.v2f32(<2 x float> [[F]], <2 x float> [[NEG]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[DIV]] ; - %neg = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %f, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %neg = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %f, metadata !"round.tonearest", metadata !"fpexcept.ignore") %div = call nnan <2 x float> @llvm.experimental.constrained.fdiv.v2f32(<2 x float> %f, <2 x float> %neg, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %div } diff --git a/llvm/test/Transforms/InstSimplify/fast-math.ll b/llvm/test/Transforms/InstSimplify/fast-math.ll index d1818e6346d7a3..287f30b162f804 100644 --- a/llvm/test/Transforms/InstSimplify/fast-math.ll +++ b/llvm/test/Transforms/InstSimplify/fast-math.ll @@ -18,11 +18,11 @@ define float @mul_zero_2(float %a) { ret float %b } -define <2 x float> @mul_zero_nsz_nnan_vec_undef(<2 x float> %a) { -; CHECK-LABEL: @mul_zero_nsz_nnan_vec_undef( +define <2 x float> @mul_zero_nsz_nnan_vec_poison(<2 x float> %a) { +; CHECK-LABEL: @mul_zero_nsz_nnan_vec_poison( ; CHECK-NEXT: ret <2 x float> zeroinitializer ; - %b = fmul nsz nnan <2 x float> %a, + %b = fmul nsz nnan <2 
x float> %a, ret <2 x float> %b } @@ -94,11 +94,11 @@ define <2 x float> @fadd_unary_fnegx_commute_vec(<2 x float> %x) { ret <2 x float> %r } -define <2 x float> @fadd_fnegx_commute_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fadd_fnegx_commute_vec_undef( +define <2 x float> @fadd_fnegx_commute_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fadd_fnegx_commute_vec_poison( ; CHECK-NEXT: ret <2 x float> zeroinitializer ; - %negx = fsub <2 x float> , %x + %negx = fsub <2 x float> , %x %r = fadd nnan <2 x float> %x, %negx ret <2 x float> %r } @@ -226,30 +226,30 @@ define float @fneg_x(float %a) { ret float %ret } -define <2 x float> @fsub_0_0_x_vec_undef1(<2 x float> %a) { -; CHECK-LABEL: @fsub_0_0_x_vec_undef1( +define <2 x float> @fsub_0_0_x_vec_poison1(<2 x float> %a) { +; CHECK-LABEL: @fsub_0_0_x_vec_poison1( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; - %t1 = fsub <2 x float> , %a + %t1 = fsub <2 x float> , %a %ret = fsub nsz <2 x float> zeroinitializer, %t1 ret <2 x float> %ret } -define <2 x float> @fneg_x_vec_undef1(<2 x float> %a) { -; CHECK-LABEL: @fneg_x_vec_undef1( +define <2 x float> @fneg_x_vec_poison1(<2 x float> %a) { +; CHECK-LABEL: @fneg_x_vec_poison1( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; %t1 = fneg <2 x float> %a - %ret = fsub nsz <2 x float> , %t1 + %ret = fsub nsz <2 x float> , %t1 ret <2 x float> %ret } -define <2 x float> @fsub_0_0_x_vec_undef2(<2 x float> %a) { -; CHECK-LABEL: @fsub_0_0_x_vec_undef2( +define <2 x float> @fsub_0_0_x_vec_poison2(<2 x float> %a) { +; CHECK-LABEL: @fsub_0_0_x_vec_poison2( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; %t1 = fsub <2 x float> zeroinitializer, %a - %ret = fsub nsz <2 x float> , %t1 + %ret = fsub nsz <2 x float> , %t1 ret <2 x float> %ret } @@ -263,11 +263,11 @@ define <2 x float> @fadd_zero_nsz_vec(<2 x float> %x) { ret <2 x float> %r } -define <2 x float> @fadd_zero_nsz_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fadd_zero_nsz_vec_undef( +define <2 x float> @fadd_zero_nsz_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fadd_zero_nsz_vec_poison( ; CHECK-NEXT: ret <2 x float> [[X:%.*]] ; - %r = fadd nsz <2 x float> %x, + %r = fadd nsz <2 x float> %x, ret <2 x float> %r } @@ -357,11 +357,11 @@ define double @fdiv_zero_by_x(double %x) { ret double %r } -define <2 x double> @fdiv_zero_by_x_vec_undef(<2 x double> %x) { -; CHECK-LABEL: @fdiv_zero_by_x_vec_undef( +define <2 x double> @fdiv_zero_by_x_vec_poison(<2 x double> %x) { +; CHECK-LABEL: @fdiv_zero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> zeroinitializer ; - %r = fdiv nnan nsz <2 x double> , %x + %r = fdiv nnan nsz <2 x double> , %x ret <2 x double> %r } @@ -376,11 +376,11 @@ define double @frem_zero_by_x(double %x) { ret double %r } -define <2 x double> @frem_poszero_by_x_vec_undef(<2 x double> %x) { -; CHECK-LABEL: @frem_poszero_by_x_vec_undef( +define <2 x double> @frem_poszero_by_x_vec_poison(<2 x double> %x) { +; CHECK-LABEL: @frem_poszero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> zeroinitializer ; - %r = frem nnan <2 x double> , %x + %r = frem nnan <2 x double> , %x ret <2 x double> %r } @@ -395,11 +395,11 @@ define double @frem_negzero_by_x(double %x) { ret double %r } -define <2 x double> @frem_negzero_by_x_vec_undef(<2 x double> %x) { -; CHECK-LABEL: @frem_negzero_by_x_vec_undef( +define <2 x double> @frem_negzero_by_x_vec_poison(<2 x double> %x) { +; CHECK-LABEL: @frem_negzero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> ; - %r = frem nnan <2 x double> , %x + %r = frem nnan <2 x double> , %x ret <2 x double> %r } @@ -467,11 +467,11 @@ define float 
@fdiv_neg_swapped2(float %f) { ret float %div } -define <2 x float> @fdiv_neg_vec_undef_elt(<2 x float> %f) { -; CHECK-LABEL: @fdiv_neg_vec_undef_elt( +define <2 x float> @fdiv_neg_vec_poison_elt(<2 x float> %f) { +; CHECK-LABEL: @fdiv_neg_vec_poison_elt( ; CHECK-NEXT: ret <2 x float> ; - %neg = fsub <2 x float> , %f + %neg = fsub <2 x float> , %f %div = fdiv nnan <2 x float> %f, %neg ret <2 x float> %div } diff --git a/llvm/test/Transforms/InstSimplify/fdiv.ll b/llvm/test/Transforms/InstSimplify/fdiv.ll index 38e31257e185ae..fb59011b91d5bd 100644 --- a/llvm/test/Transforms/InstSimplify/fdiv.ll +++ b/llvm/test/Transforms/InstSimplify/fdiv.ll @@ -110,11 +110,11 @@ define <2 x float> @fdiv_nnan_ninf_by_undef_v2f32(<2 x float> %x) { ret <2 x float> %fdiv } -define <2 x float> @fdiv_nnan_ninf_by_zero_undef_v2f32(<2 x float> %x) { -; CHECK-LABEL: @fdiv_nnan_ninf_by_zero_undef_v2f32( +define <2 x float> @fdiv_nnan_ninf_by_zero_poison_v2f32(<2 x float> %x) { +; CHECK-LABEL: @fdiv_nnan_ninf_by_zero_poison_v2f32( ; CHECK-NEXT: ret <2 x float> poison ; - %fdiv = fdiv nnan ninf <2 x float> %x, + %fdiv = fdiv nnan ninf <2 x float> %x, ret <2 x float> %fdiv } diff --git a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic-strictfp.ll b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic-strictfp.ll index e4748a24029236..32ea4cb7cd198d 100644 --- a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic-strictfp.ll +++ b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic-strictfp.ll @@ -24,23 +24,23 @@ define <2 x float> @fsub_-0_x_vec(<2 x float> %a) #0 { ret <2 x float> %ret } -define <2 x float> @fsub_-0_x_vec_undef_elts(<2 x float> %a) #0 { -; CHECK-LABEL: @fsub_-0_x_vec_undef_elts( -; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fsub_-0_x_vec_poison_elts(<2 x float> %a) #0 { +; CHECK-LABEL: @fsub_-0_x_vec_poison_elts( +; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: [[RET:%.*]] = fneg <2 x float> [[T1]] ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") %ret = fneg <2 x float> %t1 ret <2 x float> %ret } -define <2 x float> @fsub_negzero_vec_undef_elts(<2 x float> %x) #0 { -; CHECK-LABEL: @fsub_negzero_vec_undef_elts( -; CHECK-NEXT: [[R:%.*]] = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[X:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fsub_negzero_vec_poison_elts(<2 x float> %x) #0 { +; CHECK-LABEL: @fsub_negzero_vec_poison_elts( +; CHECK-NEXT: [[R:%.*]] = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[X:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[R]] ; - %r = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> 
%x, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %r } @@ -86,23 +86,23 @@ define <2 x float> @fneg_x_vec(<2 x float> %a) #0 { ret <2 x float> %ret } -define <2 x float> @fsub_-0_-0_x_vec_undef_elts(<2 x float> %a) #0 { -; CHECK-LABEL: @fsub_-0_-0_x_vec_undef_elts( -; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") -; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[T1]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fsub_-0_-0_x_vec_poison_elts(<2 x float> %a) #0 { +; CHECK-LABEL: @fsub_-0_-0_x_vec_poison_elts( +; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[T1]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") - %ret = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %ret = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } -define <2 x float> @fneg_x_vec_undef_elts(<2 x float> %a) #0 { -; CHECK-LABEL: @fneg_x_vec_undef_elts( +define <2 x float> @fneg_x_vec_poison_elts(<2 x float> %a) #0 { +; CHECK-LABEL: @fneg_x_vec_poison_elts( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; %t1 = fneg <2 x float> %a - %ret = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %ret = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } @@ -139,11 +139,11 @@ define float @fsub_x_0(float %x) #0 { ret float %r } -define <2 x float> @fsub_x_0_vec_undef(<2 x float> %x) #0 { -; CHECK-LABEL: @fsub_x_0_vec_undef( +define <2 x float> @fsub_x_0_vec_poison(<2 x float> %x) #0 { +; CHECK-LABEL: @fsub_x_0_vec_poison( ; CHECK-NEXT: ret <2 x float> [[X:%.*]] ; - %r = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float>, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float>, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %r } @@ -156,11 +156,11 @@ define float @fadd_x_n0(float %a) #0 { ret float %ret } -define <2 x float> @fadd_x_n0_vec_undef_elt(<2 x float> %a) #0 { -; CHECK-LABEL: @fadd_x_n0_vec_undef_elt( +define <2 x float> @fadd_x_n0_vec_poison_elt(<2 x float> %a) #0 { +; CHECK-LABEL: @fadd_x_n0_vec_poison_elt( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; - %ret = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %a, <2 x float> , metadata !"round.tonearest", metadata 
!"fpexcept.ignore") + %ret = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %a, <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } @@ -174,12 +174,12 @@ define float @fadd_x_p0(float %a) #0 { ret float %ret } -define <2 x float> @fadd_x_p0_vec_undef_elt(<2 x float> %a) #0 { -; CHECK-LABEL: @fadd_x_p0_vec_undef_elt( -; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[A:%.*]], <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fadd_x_p0_vec_poison_elt(<2 x float> %a) #0 { +; CHECK-LABEL: @fadd_x_p0_vec_poison_elt( +; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[A:%.*]], <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %ret = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %a, <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") + %ret = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %a, <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } diff --git a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll index 5d17504c09df67..7a35f09f03b995 100644 --- a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll +++ b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll @@ -30,20 +30,20 @@ define <2 x float> @fsub_-0_x_vec(<2 x float> %a) { ret <2 x float> %ret } -define <2 x float> @fsub_-0_x_vec_undef_elts(<2 x float> %a) { -; CHECK-LABEL: @fsub_-0_x_vec_undef_elts( +define <2 x float> @fsub_-0_x_vec_poison_elts(<2 x float> %a) { +; CHECK-LABEL: @fsub_-0_x_vec_poison_elts( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; - %t1 = fsub <2 x float> , %a + %t1 = fsub <2 x float> , %a %ret = fneg <2 x float> %t1 ret <2 x float> %ret } -define <2 x float> @fsub_negzero_vec_undef_elts(<2 x float> %x) { -; CHECK-LABEL: @fsub_negzero_vec_undef_elts( +define <2 x float> @fsub_negzero_vec_poison_elts(<2 x float> %x) { +; CHECK-LABEL: @fsub_negzero_vec_poison_elts( ; CHECK-NEXT: ret <2 x float> [[X:%.*]] ; - %r = fsub nsz <2 x float> %x, + %r = fsub nsz <2 x float> %x, ret <2 x float> %r } @@ -85,21 +85,21 @@ define <2 x float> @fneg_x_vec(<2 x float> %a) { ret <2 x float> %ret } -define <2 x float> @fsub_-0_-0_x_vec_undef_elts(<2 x float> %a) { -; CHECK-LABEL: @fsub_-0_-0_x_vec_undef_elts( +define <2 x float> @fsub_-0_-0_x_vec_poison_elts(<2 x float> %a) { +; CHECK-LABEL: @fsub_-0_-0_x_vec_poison_elts( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; - %t1 = fsub <2 x float> , %a - %ret = fsub <2 x float> , %t1 + %t1 = fsub <2 x float> , %a + %ret = fsub <2 x float> , %t1 ret <2 x float> %ret } -define <2 x float> @fneg_x_vec_undef_elts(<2 x float> %a) { -; CHECK-LABEL: @fneg_x_vec_undef_elts( +define <2 x float> @fneg_x_vec_poison_elts(<2 x float> %a) { +; CHECK-LABEL: @fneg_x_vec_poison_elts( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; %t1 = fneg <2 x float> %a - %ret = fsub <2 x float> , %t1 + %ret = fsub <2 x float> , %t1 ret <2 x float> %ret } @@ -136,11 +136,11 @@ define float @fsub_x_0(float %x) { ret float %r } -define <2 x float> @fsub_x_0_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fsub_x_0_vec_undef( +define <2 x float> @fsub_x_0_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fsub_x_0_vec_poison( ; CHECK-NEXT: ret <2 x float> [[X:%.*]] ; - 
%r = fsub <2 x float> %x, + %r = fsub <2 x float> %x, ret <2 x float> %r } @@ -153,11 +153,11 @@ define float @fadd_x_n0(float %a) { ret float %ret } -define <2 x float> @fadd_x_n0_vec_undef_elt(<2 x float> %a) { -; CHECK-LABEL: @fadd_x_n0_vec_undef_elt( +define <2 x float> @fadd_x_n0_vec_poison_elt(<2 x float> %a) { +; CHECK-LABEL: @fadd_x_n0_vec_poison_elt( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; - %ret = fadd <2 x float> %a, + %ret = fadd <2 x float> %a, ret <2 x float> %ret } diff --git a/llvm/test/Transforms/InstSimplify/floating-point-compare.ll b/llvm/test/Transforms/InstSimplify/floating-point-compare.ll index 3c1794c81284d7..70f0321039ea94 100644 --- a/llvm/test/Transforms/InstSimplify/floating-point-compare.ll +++ b/llvm/test/Transforms/InstSimplify/floating-point-compare.ll @@ -547,30 +547,30 @@ define <2 x i1> @fabs_is_not_negative_anyzero(<2 x float> %V) { ret <2 x i1> %cmp } -define <3 x i1> @fabs_is_not_negative_negzero_undef(<3 x float> %V) { -; CHECK-LABEL: @fabs_is_not_negative_negzero_undef( +define <3 x i1> @fabs_is_not_negative_negzero_poison(<3 x float> %V) { +; CHECK-LABEL: @fabs_is_not_negative_negzero_poison( ; CHECK-NEXT: ret <3 x i1> zeroinitializer ; %abs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %V) - %cmp = fcmp olt <3 x float> %abs, + %cmp = fcmp olt <3 x float> %abs, ret <3 x i1> %cmp } -define <3 x i1> @fabs_is_not_negative_poszero_undef(<3 x float> %V) { -; CHECK-LABEL: @fabs_is_not_negative_poszero_undef( +define <3 x i1> @fabs_is_not_negative_poszero_poison(<3 x float> %V) { +; CHECK-LABEL: @fabs_is_not_negative_poszero_poison( ; CHECK-NEXT: ret <3 x i1> zeroinitializer ; %abs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %V) - %cmp = fcmp olt <3 x float> %abs, + %cmp = fcmp olt <3 x float> %abs, ret <3 x i1> %cmp } -define <3 x i1> @fabs_is_not_negative_anyzero_undef(<3 x float> %V) { -; CHECK-LABEL: @fabs_is_not_negative_anyzero_undef( +define <3 x i1> @fabs_is_not_negative_anyzero_poison(<3 x float> %V) { +; CHECK-LABEL: @fabs_is_not_negative_anyzero_poison( ; CHECK-NEXT: ret <3 x i1> zeroinitializer ; %abs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %V) - %cmp = fcmp olt <3 x float> %abs, + %cmp = fcmp olt <3 x float> %abs, ret <3 x i1> %cmp } @@ -1335,19 +1335,19 @@ define <2 x i1> @orderedCompareWithNaNVector(<2 x double> %A) { ret <2 x i1> %cmp } -define <2 x i1> @orderedCompareWithNaNVector_undef_elt(<2 x double> %A) { -; CHECK-LABEL: @orderedCompareWithNaNVector_undef_elt( +define <2 x i1> @orderedCompareWithNaNVector_poison_elt(<2 x double> %A) { +; CHECK-LABEL: @orderedCompareWithNaNVector_poison_elt( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; - %cmp = fcmp olt <2 x double> %A, + %cmp = fcmp olt <2 x double> %A, ret <2 x i1> %cmp } -define <2 x i1> @unorderedCompareWithNaNVector_undef_elt(<2 x double> %A) { -; CHECK-LABEL: @unorderedCompareWithNaNVector_undef_elt( +define <2 x i1> @unorderedCompareWithNaNVector_poison_elt(<2 x double> %A) { +; CHECK-LABEL: @unorderedCompareWithNaNVector_poison_elt( ; CHECK-NEXT: ret <2 x i1> ; - %cmp = fcmp ult <2 x double> %A, + %cmp = fcmp ult <2 x double> %A, ret <2 x i1> %cmp } diff --git a/llvm/test/Transforms/InstSimplify/fminmax-folds.ll b/llvm/test/Transforms/InstSimplify/fminmax-folds.ll index a8a9e96a652fa6..668a93ddf5a426 100644 --- a/llvm/test/Transforms/InstSimplify/fminmax-folds.ll +++ b/llvm/test/Transforms/InstSimplify/fminmax-folds.ll @@ -493,7 +493,7 @@ define <2 x double> @maxnum_nan_op0_vec(<2 x double> %x) { ; CHECK-LABEL: @maxnum_nan_op0_vec( ; CHECK-NEXT: ret <2 x double> 
[[X:%.*]] ; - %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> , <2 x double> %x) + %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> , <2 x double> %x) ret <2 x double> %r } @@ -509,7 +509,7 @@ define <2 x double> @minnum_nan_op0_vec(<2 x double> %x) { ; CHECK-LABEL: @minnum_nan_op0_vec( ; CHECK-NEXT: ret <2 x double> [[X:%.*]] ; - %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> , <2 x double> %x) + %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> , <2 x double> %x) ret <2 x double> %r } @@ -873,19 +873,19 @@ define double @minimum_nan_op1(double %x) { ret double %r } -define <2 x double> @maximum_nan_op0_vec_partial_undef(<2 x double> %x) { -; CHECK-LABEL: @maximum_nan_op0_vec_partial_undef( -; CHECK-NEXT: ret <2 x double> +define <2 x double> @maximum_nan_op0_vec_partial_poison(<2 x double> %x) { +; CHECK-LABEL: @maximum_nan_op0_vec_partial_poison( +; CHECK-NEXT: ret <2 x double> ; - %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> , <2 x double> %x) + %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> , <2 x double> %x) ret <2 x double> %r } -define <2 x double> @maximum_nan_op1_vec_partial_undef(<2 x double> %x) { -; CHECK-LABEL: @maximum_nan_op1_vec_partial_undef( -; CHECK-NEXT: ret <2 x double> +define <2 x double> @maximum_nan_op1_vec_partial_poison(<2 x double> %x) { +; CHECK-LABEL: @maximum_nan_op1_vec_partial_poison( +; CHECK-NEXT: ret <2 x double> ; - %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> ) + %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> ) ret <2 x double> %r } @@ -897,19 +897,19 @@ define <2 x double> @maximum_nan_op1_vec(<2 x double> %x) { ret <2 x double> %r } -define <2 x double> @minimum_nan_op0_vec_partial_undef(<2 x double> %x) { -; CHECK-LABEL: @minimum_nan_op0_vec_partial_undef( -; CHECK-NEXT: ret <2 x double> +define <2 x double> @minimum_nan_op0_vec_partial_poison(<2 x double> %x) { +; CHECK-LABEL: @minimum_nan_op0_vec_partial_poison( +; CHECK-NEXT: ret <2 x double> ; - %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> , <2 x double> %x) + %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> , <2 x double> %x) ret <2 x double> %r } -define <2 x double> @minimum_nan_op1_vec_partial_undef(<2 x double> %x) { -; CHECK-LABEL: @minimum_nan_op1_vec_partial_undef( -; CHECK-NEXT: ret <2 x double> +define <2 x double> @minimum_nan_op1_vec_partial_poison(<2 x double> %x) { +; CHECK-LABEL: @minimum_nan_op1_vec_partial_poison( +; CHECK-NEXT: ret <2 x double> ; - %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> ) + %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> ) ret <2 x double> %r } diff --git a/llvm/test/Transforms/InstSimplify/fp-nan.ll b/llvm/test/Transforms/InstSimplify/fp-nan.ll index cb0bed3790782b..bb557500822c14 100644 --- a/llvm/test/Transforms/InstSimplify/fp-nan.ll +++ b/llvm/test/Transforms/InstSimplify/fp-nan.ll @@ -163,13 +163,13 @@ define <2 x double> @fsub_nan_poison_op1(<2 x double> %x) { ret <2 x double> %r } -; Vector with undef element +; Vector with poison element -define <2 x double> @frem_nan_undef_op0(<2 x double> %x) { -; CHECK-LABEL: @frem_nan_undef_op0( -; CHECK-NEXT: ret <2 x double> +define <2 x double> @frem_nan_poison_op0(<2 x double> %x) { +; CHECK-LABEL: @frem_nan_poison_op0( +; CHECK-NEXT: ret <2 x double> ; - %r = frem <2 x double> , %x + %r = frem <2 x double> , %x ret <2 x double> %r } @@ -177,7 +177,8 @@ define <2 x double> @frem_nan_undef_op0(<2 x double> %x) { define <3 x double> 
@fadd_nan_poison_undef_op1(<3 x double> %x) { ; CHECK-LABEL: @fadd_nan_poison_undef_op1( -; CHECK-NEXT: ret <3 x double> +; CHECK-NEXT: [[R:%.*]] = fadd <3 x double> [[X:%.*]], +; CHECK-NEXT: ret <3 x double> [[R]] ; %r = fadd <3 x double> %x, ret <3 x double> %r diff --git a/llvm/test/Transforms/InstSimplify/icmp-bool-constant.ll b/llvm/test/Transforms/InstSimplify/icmp-bool-constant.ll index 6205225098a7a3..a501f995b6c975 100644 --- a/llvm/test/Transforms/InstSimplify/icmp-bool-constant.ll +++ b/llvm/test/Transforms/InstSimplify/icmp-bool-constant.ll @@ -12,11 +12,11 @@ define <2 x i1> @eq_t(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @eq_t_undef_elt(<2 x i1> %a) { -; CHECK-LABEL: @eq_t_undef_elt( +define <2 x i1> @eq_t_poison_elt(<2 x i1> %a) { +; CHECK-LABEL: @eq_t_poison_elt( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %r = icmp eq <2 x i1> %a, + %r = icmp eq <2 x i1> %a, ret <2 x i1> %r } @@ -54,11 +54,11 @@ define <2 x i1> @ugt_t(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @ugt_t_undef_elt(<2 x i1> %a) { -; CHECK-LABEL: @ugt_t_undef_elt( +define <2 x i1> @ugt_t_poison_elt(<2 x i1> %a) { +; CHECK-LABEL: @ugt_t_poison_elt( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; - %r = icmp ugt <2 x i1> %a, + %r = icmp ugt <2 x i1> %a, ret <2 x i1> %r } @@ -161,11 +161,11 @@ define <2 x i1> @sge_t(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @sge_t_undef_elt(<2 x i1> %a) { -; CHECK-LABEL: @sge_t_undef_elt( +define <2 x i1> @sge_t_poison_elt(<2 x i1> %a) { +; CHECK-LABEL: @sge_t_poison_elt( ; CHECK-NEXT: ret <2 x i1> ; - %r = icmp sge <2 x i1> %a, + %r = icmp sge <2 x i1> %a, ret <2 x i1> %r } diff --git a/llvm/test/Transforms/InstSimplify/icmp-not-bool-constant.ll b/llvm/test/Transforms/InstSimplify/icmp-not-bool-constant.ll index f4a0b6ddf66214..045d773bf32841 100644 --- a/llvm/test/Transforms/InstSimplify/icmp-not-bool-constant.ll +++ b/llvm/test/Transforms/InstSimplify/icmp-not-bool-constant.ll @@ -33,11 +33,11 @@ define <2 x i1> @eq_f_not_swap(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @eq_f_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @eq_f_not_undef( +define <2 x i1> @eq_f_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @eq_f_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp eq <2 x i1> %not, ret <2 x i1> %r } @@ -60,11 +60,11 @@ define <2 x i1> @ne_t_not_swap(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @ne_t_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @ne_t_not_undef( +define <2 x i1> @ne_t_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @ne_t_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp ne <2 x i1> %not, ret <2 x i1> %r } @@ -116,11 +116,11 @@ define <2 x i1> @ult_t_not_swap(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @ult_t_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @ult_t_not_undef( +define <2 x i1> @ult_t_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @ult_t_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp ult <2 x i1> %not, ret <2 x i1> %r } @@ -152,11 +152,11 @@ define <2 x i1> @sgt_t_not_swap(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @sgt_t_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @sgt_t_not_undef( +define <2 x i1> @sgt_t_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @sgt_t_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp sgt <2 x i1> %not, ret <2 x i1> %r } @@ -235,11 +235,11 @@ define <2 x i1> @ule_f_not_swap(<2 x 
i1> %a) { ret <2 x i1> %r } -define <2 x i1> @ule_f_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @ule_f_not_undef( +define <2 x i1> @ule_f_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @ule_f_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp ule <2 x i1> %not, ret <2 x i1> %r } @@ -271,11 +271,11 @@ define <2 x i1> @sge_f_not_swap(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @sge_f_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @sge_f_not_undef( +define <2 x i1> @sge_f_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @sge_f_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp sge <2 x i1> %not, ret <2 x i1> %r } diff --git a/llvm/test/Transforms/InstSimplify/ldexp.ll b/llvm/test/Transforms/InstSimplify/ldexp.ll index c6bb0141199f21..d39f6a1e49673f 100644 --- a/llvm/test/Transforms/InstSimplify/ldexp.ll +++ b/llvm/test/Transforms/InstSimplify/ldexp.ll @@ -57,11 +57,12 @@ define void @ldexp_f32_exp0(float %x) { define void @ldexp_v2f32_exp0(<2 x float> %x) { ; CHECK-LABEL: @ldexp_v2f32_exp0( ; CHECK-NEXT: store volatile <2 x float> [[X:%.*]], ptr addrspace(1) undef, align 8 -; CHECK-NEXT: store volatile <2 x float> [[X]], ptr addrspace(1) undef, align 8 +; CHECK-NEXT: [[PART_UNDEF1:%.*]] = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> [[X]], <2 x i32> ) +; CHECK-NEXT: store volatile <2 x float> [[PART_UNDEF1]], ptr addrspace(1) undef, align 8 ; CHECK-NEXT: store volatile <2 x float> [[X]], ptr addrspace(1) undef, align 8 ; CHECK-NEXT: ret void ; - %part.undef0 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> ) + %part.undef0 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> ) store volatile <2 x float> %part.undef0, ptr addrspace(1) undef %part.undef1 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> ) diff --git a/llvm/test/Transforms/InstSimplify/mul.ll b/llvm/test/Transforms/InstSimplify/mul.ll index 8ae7f1eaac92b0..a1b03a30fe4f42 100644 --- a/llvm/test/Transforms/InstSimplify/mul.ll +++ b/llvm/test/Transforms/InstSimplify/mul.ll @@ -34,11 +34,11 @@ define <16 x i8> @mul_by_0_vec(<16 x i8> %a) { ret <16 x i8> %b } -define <2 x i8> @mul_by_0_vec_undef_elt(<2 x i8> %a) { -; CHECK-LABEL: @mul_by_0_vec_undef_elt( +define <2 x i8> @mul_by_0_vec_poison_elt(<2 x i8> %a) { +; CHECK-LABEL: @mul_by_0_vec_poison_elt( ; CHECK-NEXT: ret <2 x i8> zeroinitializer ; - %b = mul <2 x i8> %a, + %b = mul <2 x i8> %a, ret <2 x i8> %b } diff --git a/llvm/test/Transforms/InstSimplify/negate.ll b/llvm/test/Transforms/InstSimplify/negate.ll index d72a0db6d445cd..d07029becd1fe9 100644 --- a/llvm/test/Transforms/InstSimplify/negate.ll +++ b/llvm/test/Transforms/InstSimplify/negate.ll @@ -17,11 +17,11 @@ define <2 x i32> @negate_nuw_vec(<2 x i32> %x) { ret <2 x i32> %neg } -define <2 x i32> @negate_nuw_vec_undef_elt(<2 x i32> %x) { -; CHECK-LABEL: @negate_nuw_vec_undef_elt( +define <2 x i32> @negate_nuw_vec_poison_elt(<2 x i32> %x) { +; CHECK-LABEL: @negate_nuw_vec_poison_elt( ; CHECK-NEXT: ret <2 x i32> zeroinitializer ; - %neg = sub nuw <2 x i32> , %x + %neg = sub nuw <2 x i32> , %x ret <2 x i32> %neg } @@ -43,12 +43,12 @@ define <2 x i8> @negate_zero_or_minsigned_nsw_vec(<2 x i8> %x) { ret <2 x i8> %neg } -define <2 x i8> @negate_zero_or_minsigned_nsw_vec_undef_elt(<2 x i8> %x) { -; CHECK-LABEL: @negate_zero_or_minsigned_nsw_vec_undef_elt( +define <2 x i8> @negate_zero_or_minsigned_nsw_vec_poison_elt(<2 x i8> %x) { +; CHECK-LABEL: 
@negate_zero_or_minsigned_nsw_vec_poison_elt( ; CHECK-NEXT: ret <2 x i8> zeroinitializer ; %signbit = shl <2 x i8> %x, - %neg = sub nsw <2 x i8> , %signbit + %neg = sub nsw <2 x i8> , %signbit ret <2 x i8> %neg } diff --git a/llvm/test/Transforms/InstSimplify/or.ll b/llvm/test/Transforms/InstSimplify/or.ll index 913b760dd331ce..f241c6987b9e70 100644 --- a/llvm/test/Transforms/InstSimplify/or.ll +++ b/llvm/test/Transforms/InstSimplify/or.ll @@ -17,11 +17,11 @@ define i32 @all_ones(i32 %A) { ret i32 %B } -define <3 x i8> @all_ones_vec_with_undef_elt(<3 x i8> %A) { -; CHECK-LABEL: @all_ones_vec_with_undef_elt( +define <3 x i8> @all_ones_vec_with_poison_elt(<3 x i8> %A) { +; CHECK-LABEL: @all_ones_vec_with_poison_elt( ; CHECK-NEXT: ret <3 x i8> ; - %B = or <3 x i8> %A, + %B = or <3 x i8> %A, ret <3 x i8> %B } @@ -68,11 +68,11 @@ define i32 @or_not(i32 %A) { ret i32 %B } -define <2 x i4> @or_not_commute_vec_undef(<2 x i4> %A) { -; CHECK-LABEL: @or_not_commute_vec_undef( +define <2 x i4> @or_not_commute_vec_poison(<2 x i4> %A) { +; CHECK-LABEL: @or_not_commute_vec_poison( ; CHECK-NEXT: ret <2 x i4> ; - %NotA = xor <2 x i4> %A, + %NotA = xor <2 x i4> %A, %B = or <2 x i4> %NotA, %A ret <2 x i4> %B } @@ -335,7 +335,7 @@ define <2 x i1> @or_with_not_op_commute4(<2 x i1> %a, <2 x i1> %b) { ; CHECK-NEXT: ret <2 x i1> ; %ab = and <2 x i1> %b, %a - %not = xor <2 x i1> %ab, + %not = xor <2 x i1> %ab, %r = or <2 x i1> %not, %a ret <2 x i1> %r } @@ -515,6 +515,21 @@ define <2 x i4> @and_or_not_or_commute7_undef_elt(<2 x i4> %A, <2 x i4> %B) { ret <2 x i4> %r } +; doing the same with poison is safe. + +define <2 x i4> @and_or_not_or_commute7_poison_elt(<2 x i4> %A, <2 x i4> %B) { +; CHECK-LABEL: @and_or_not_or_commute7_poison_elt( +; CHECK-NEXT: [[NOTA:%.*]] = xor <2 x i4> [[A:%.*]], +; CHECK-NEXT: ret <2 x i4> [[NOTA]] +; + %nota = xor <2 x i4> %A, + %and = and <2 x i4> %B, %nota + %or = or <2 x i4> %B, %A + %notab = xor <2 x i4> %or, + %r = or <2 x i4> %notab, %and + ret <2 x i4> %r +} + ; (A | B) | (A ^ B) --> A | B define i69 @or_or_xor(i69 %A, i69 %B) { @@ -769,6 +784,21 @@ define <2 x i4> @or_nxor_and_undef_elt(<2 x i4> %a, <2 x i4> %b) { ret <2 x i4> %r } +; Same with poison is safe. 
+ +define <2 x i4> @or_nxor_and_poison_elt(<2 x i4> %a, <2 x i4> %b) { +; CHECK-LABEL: @or_nxor_and_poison_elt( +; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i4> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[NOT:%.*]] = xor <2 x i4> [[XOR]], +; CHECK-NEXT: ret <2 x i4> [[NOT]] +; + %and = and <2 x i4> %b, %a + %xor = xor <2 x i4> %a, %b + %not = xor <2 x i4> %xor, + %r = or <2 x i4> %not, %and + ret <2 x i4> %r +} + ; ~(A ^ B) | (A | B) --> -1 define i4 @or_nxor_or_commute0(i4 %a, i4 %b) { @@ -849,15 +879,15 @@ define i4 @or_nxor_or_wrong_val2(i4 %a, i4 %b, i4 %c) { ret i4 %r } -; negative test - undef in 'not' is allowed +; negative test - poison in 'not' is allowed -define <2 x i4> @or_nxor_or_undef_elt(<2 x i4> %a, <2 x i4> %b) { -; CHECK-LABEL: @or_nxor_or_undef_elt( +define <2 x i4> @or_nxor_or_poison_elt(<2 x i4> %a, <2 x i4> %b) { +; CHECK-LABEL: @or_nxor_or_poison_elt( ; CHECK-NEXT: ret <2 x i4> ; %or = or <2 x i4> %b, %a %xor = xor <2 x i4> %a, %b - %not = xor <2 x i4> %xor, + %not = xor <2 x i4> %xor, %r = or <2 x i4> %or, %not ret <2 x i4> %r } @@ -966,12 +996,12 @@ define i32 @or_xor_not_op_or_commute7(i32 %a, i32 %b){ ret i32 %r } -define <2 x i4> @or_xor_not_op_or_undef_elt(<2 x i4> %a, <2 x i4> %b) { -; CHECK-LABEL: @or_xor_not_op_or_undef_elt( +define <2 x i4> @or_xor_not_op_or_poison_elt(<2 x i4> %a, <2 x i4> %b) { +; CHECK-LABEL: @or_xor_not_op_or_poison_elt( ; CHECK-NEXT: ret <2 x i4> ; %xor = xor <2 x i4> %a, %b - %nota = xor <2 x i4> %a, + %nota = xor <2 x i4> %a, %or = or <2 x i4> %nota, %b %r = or <2 x i4> %xor, %or ret <2 x i4> %r @@ -1082,6 +1112,21 @@ define <2 x i4> @or_nand_xor_undef_elt(<2 x i4> %x, <2 x i4> %y) { ret <2 x i4> %or } +; Same with poison is safe. + +define <2 x i4> @or_nand_xor_poison_elt(<2 x i4> %x, <2 x i4> %y) { +; CHECK-LABEL: @or_nand_xor_poison_elt( +; CHECK-NEXT: [[AND:%.*]] = and <2 x i4> [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[NAND:%.*]] = xor <2 x i4> [[AND]], +; CHECK-NEXT: ret <2 x i4> [[NAND]] +; + %and = and <2 x i4> %y, %x + %xor = xor <2 x i4> %x, %y + %nand = xor <2 x i4> %and, + %or = or <2 x i4> %xor, %nand + ret <2 x i4> %or +} + declare i32 @llvm.fshl.i32 (i32, i32, i32) declare i32 @llvm.fshr.i32 (i32, i32, i32) diff --git a/llvm/test/Transforms/InstSimplify/ptrmask.ll b/llvm/test/Transforms/InstSimplify/ptrmask.ll index dd83abfdeee464..d2c4a5dd7f0353 100644 --- a/llvm/test/Transforms/InstSimplify/ptrmask.ll +++ b/llvm/test/Transforms/InstSimplify/ptrmask.ll @@ -40,7 +40,8 @@ define <2 x ptr addrspace(1) > @ptrmask_simplify_poison_and_zero_i32_vec_fail(<2 define <2 x ptr> @ptrmask_simplify_undef_and_ones_vec(<2 x ptr> %p) { ; CHECK-LABEL: define <2 x ptr> @ptrmask_simplify_undef_and_ones_vec ; CHECK-SAME: (<2 x ptr> [[P:%.*]]) { -; CHECK-NEXT: ret <2 x ptr> [[P]] +; CHECK-NEXT: [[R:%.*]] = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> [[P]], <2 x i64> ) +; CHECK-NEXT: ret <2 x ptr> [[R]] ; %r = call <2 x ptr> @llvm.ptrmask.v2p1.v2i64(<2 x ptr> %p, <2 x i64> ) ret <2 x ptr> %r diff --git a/llvm/test/Transforms/InstSimplify/rem.ll b/llvm/test/Transforms/InstSimplify/rem.ll index 5af3b5f7c5e0bf..a46db0342042fa 100644 --- a/llvm/test/Transforms/InstSimplify/rem.ll +++ b/llvm/test/Transforms/InstSimplify/rem.ll @@ -17,11 +17,11 @@ define <2 x i32> @zero_dividend_vector(<2 x i32> %A) { ret <2 x i32> %B } -define <2 x i32> @zero_dividend_vector_undef_elt(<2 x i32> %A) { -; CHECK-LABEL: @zero_dividend_vector_undef_elt( +define <2 x i32> @zero_dividend_vector_poison_elt(<2 x i32> %A) { +; CHECK-LABEL: @zero_dividend_vector_poison_elt( ; 
CHECK-NEXT: ret <2 x i32> zeroinitializer ; - %B = urem <2 x i32> , %A + %B = urem <2 x i32> , %A ret <2 x i32> %B } diff --git a/llvm/test/Transforms/InstSimplify/saturating-add-sub.ll b/llvm/test/Transforms/InstSimplify/saturating-add-sub.ll index 6fb12612f2f721..40b22c619f7686 100644 --- a/llvm/test/Transforms/InstSimplify/saturating-add-sub.ll +++ b/llvm/test/Transforms/InstSimplify/saturating-add-sub.ll @@ -44,7 +44,7 @@ define <2 x i8> @uadd_vector_0_commute(<2 x i8> %a) { ; CHECK-LABEL: @uadd_vector_0_commute( ; CHECK-NEXT: ret <2 x i8> [[A:%.*]] ; - %x2v = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> , <2 x i8> %a) + %x2v = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> , <2 x i8> %a) ret <2 x i8> %x2v } @@ -156,7 +156,7 @@ define <2 x i8> @sadd_vector_0(<2 x i8> %a) { ; CHECK-LABEL: @sadd_vector_0( ; CHECK-NEXT: ret <2 x i8> [[A:%.*]] ; - %y1v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a, <2 x i8> ) + %y1v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a, <2 x i8> ) ret <2 x i8> %y1v } @@ -205,10 +205,10 @@ define i8 @sadd_scalar_maxval_commute(i8 %a) { define <2 x i8> @sadd_vector_maxval_commute(<2 x i8> %a) { ; CHECK-LABEL: @sadd_vector_maxval_commute( -; CHECK-NEXT: [[Y4V:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> , <2 x i8> [[A:%.*]]) +; CHECK-NEXT: [[Y4V:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> , <2 x i8> [[A:%.*]]) ; CHECK-NEXT: ret <2 x i8> [[Y4V]] ; - %y4v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> , <2 x i8> %a) + %y4v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> , <2 x i8> %a) ret <2 x i8> %y4v } diff --git a/llvm/test/Transforms/InstSimplify/sdiv.ll b/llvm/test/Transforms/InstSimplify/sdiv.ll index 2514d90b012355..99092802cab025 100644 --- a/llvm/test/Transforms/InstSimplify/sdiv.ll +++ b/llvm/test/Transforms/InstSimplify/sdiv.ll @@ -158,11 +158,11 @@ define <2 x i32> @knownnegation_commute_vec_bad3(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %div } -define <3 x i32> @negated_operand_vec_undef(<3 x i32> %x) { -; CHECK-LABEL: @negated_operand_vec_undef( +define <3 x i32> @negated_operand_vec_poison(<3 x i32> %x) { +; CHECK-LABEL: @negated_operand_vec_poison( ; CHECK-NEXT: ret <3 x i32> ; - %negx = sub nsw <3 x i32> , %x + %negx = sub nsw <3 x i32> , %x %div = sdiv <3 x i32> %negx, %x ret <3 x i32> %div } diff --git a/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll index 2a4ce85ed11f8d..fcf8c31b25eed2 100644 --- a/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll +++ b/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll @@ -17,11 +17,11 @@ define <2 x i1> @bool_true_or_false_vec(<2 x i1> %cond) { ret <2 x i1> %s } -define <2 x i1> @bool_true_or_false_vec_undef(<2 x i1> %cond) { -; CHECK-LABEL: @bool_true_or_false_vec_undef( +define <2 x i1> @bool_true_or_false_vec_poison(<2 x i1> %cond) { +; CHECK-LABEL: @bool_true_or_false_vec_poison( ; CHECK-NEXT: ret <2 x i1> [[COND:%.*]] ; - %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> + %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> ret <2 x i1> %s } @@ -57,27 +57,27 @@ define <2 x i32> @equal_arms_vec(<2 x i1> %cond, <2 x i32> %x) { ret <2 x i32> %V } -define <2 x i32> @equal_arms_vec_undef(<2 x i1> %cond) { -; CHECK-LABEL: @equal_arms_vec_undef( +define <2 x i32> @equal_arms_vec_poison(<2 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_poison( ; CHECK-NEXT: ret <2 x i32> ; - %V = select <2 x i1> %cond, <2 x i32> , <2 x i32> + %V = select <2 x i1> %cond, <2 x i32> , <2 x i32> ret <2 x i32> %V } -define <3 x float> 
@equal_arms_vec_less_undef(<3 x i1> %cond) { -; CHECK-LABEL: @equal_arms_vec_less_undef( +define <3 x float> @equal_arms_vec_less_poison(<3 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_less_poison( ; CHECK-NEXT: ret <3 x float> ; - %V = select <3 x i1> %cond, <3 x float> , <3 x float> + %V = select <3 x i1> %cond, <3 x float> , <3 x float> ret <3 x float> %V } -define <3 x float> @equal_arms_vec_more_undef(<3 x i1> %cond) { -; CHECK-LABEL: @equal_arms_vec_more_undef( -; CHECK-NEXT: ret <3 x float> +define <3 x float> @equal_arms_vec_more_poison(<3 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_more_poison( +; CHECK-NEXT: ret <3 x float> ; - %V = select <3 x i1> %cond, <3 x float> , <3 x float> + %V = select <3 x i1> %cond, <3 x float> , <3 x float> ret <3 x float> %V } @@ -105,19 +105,19 @@ define <2 x i8> @vsel_mixedvec() { ret <2 x i8> %s } -define <3 x i8> @vsel_undef_true_op(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @vsel_undef_true_op( +define <3 x i8> @vsel_poison_true_op(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @vsel_poison_true_op( ; CHECK-NEXT: ret <3 x i8> [[X:%.*]] ; - %s = select <3 x i1>, <3 x i8> %x, <3 x i8> %y + %s = select <3 x i1>, <3 x i8> %x, <3 x i8> %y ret <3 x i8> %s } -define <3 x i4> @vsel_undef_false_op(<3 x i4> %x, <3 x i4> %y) { -; CHECK-LABEL: @vsel_undef_false_op( +define <3 x i4> @vsel_poison_false_op(<3 x i4> %x, <3 x i4> %y) { +; CHECK-LABEL: @vsel_poison_false_op( ; CHECK-NEXT: ret <3 x i4> [[Y:%.*]] ; - %s = select <3 x i1>, <3 x i4> %x, <3 x i4> %y + %s = select <3 x i1>, <3 x i4> %x, <3 x i4> %y ret <3 x i4> %s } diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll index fe93a0c3f2125e..40c1460e3ebc39 100644 --- a/llvm/test/Transforms/InstSimplify/select.ll +++ b/llvm/test/Transforms/InstSimplify/select.ll @@ -25,11 +25,11 @@ define <2 x i1> @bool_true_or_false_vec(<2 x i1> %cond) { ret <2 x i1> %s } -define <2 x i1> @bool_true_or_false_vec_undef(<2 x i1> %cond) { -; CHECK-LABEL: @bool_true_or_false_vec_undef( +define <2 x i1> @bool_true_or_false_vec_poison(<2 x i1> %cond) { +; CHECK-LABEL: @bool_true_or_false_vec_poison( ; CHECK-NEXT: ret <2 x i1> [[COND:%.*]] ; - %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> + %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> ret <2 x i1> %s } @@ -65,27 +65,27 @@ define <2 x i32> @equal_arms_vec(<2 x i1> %cond, <2 x i32> %x) { ret <2 x i32> %V } -define <2 x i32> @equal_arms_vec_undef(<2 x i1> %cond) { -; CHECK-LABEL: @equal_arms_vec_undef( +define <2 x i32> @equal_arms_vec_poison(<2 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_poison( ; CHECK-NEXT: ret <2 x i32> ; - %V = select <2 x i1> %cond, <2 x i32> , <2 x i32> + %V = select <2 x i1> %cond, <2 x i32> , <2 x i32> ret <2 x i32> %V } -define <3 x float> @equal_arms_vec_less_undef(<3 x i1> %cond) { -; CHECK-LABEL: @equal_arms_vec_less_undef( +define <3 x float> @equal_arms_vec_less_poison(<3 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_less_poison( ; CHECK-NEXT: ret <3 x float> ; - %V = select <3 x i1> %cond, <3 x float> , <3 x float> + %V = select <3 x i1> %cond, <3 x float> , <3 x float> ret <3 x float> %V } -define <3 x float> @equal_arms_vec_more_undef(<3 x i1> %cond) { -; CHECK-LABEL: @equal_arms_vec_more_undef( -; CHECK-NEXT: ret <3 x float> +define <3 x float> @equal_arms_vec_more_poison(<3 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_more_poison( +; CHECK-NEXT: ret <3 x float> ; - %V = select <3 x i1> %cond, <3 x float> , <3 x float> + %V = select <3 x i1> %cond, <3 x float> , <3 x float> ret <3 x 
float> %V } @@ -113,19 +113,19 @@ define <2 x i8> @vsel_mixedvec() { ret <2 x i8> %s } -define <3 x i8> @vsel_undef_true_op(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @vsel_undef_true_op( +define <3 x i8> @vsel_poison_true_op(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @vsel_poison_true_op( ; CHECK-NEXT: ret <3 x i8> [[X:%.*]] ; - %s = select <3 x i1>, <3 x i8> %x, <3 x i8> %y + %s = select <3 x i1>, <3 x i8> %x, <3 x i8> %y ret <3 x i8> %s } -define <3 x i4> @vsel_undef_false_op(<3 x i4> %x, <3 x i4> %y) { -; CHECK-LABEL: @vsel_undef_false_op( +define <3 x i4> @vsel_poison_false_op(<3 x i4> %x, <3 x i4> %y) { +; CHECK-LABEL: @vsel_poison_false_op( ; CHECK-NEXT: ret <3 x i4> [[Y:%.*]] ; - %s = select <3 x i1>, <3 x i4> %x, <3 x i4> %y + %s = select <3 x i1>, <3 x i4> %x, <3 x i4> %y ret <3 x i4> %s } diff --git a/llvm/test/Transforms/InstSimplify/shift.ll b/llvm/test/Transforms/InstSimplify/shift.ll index b562c3c164d52d..a816fcbdeeee00 100644 --- a/llvm/test/Transforms/InstSimplify/shift.ll +++ b/llvm/test/Transforms/InstSimplify/shift.ll @@ -17,11 +17,11 @@ define i41 @shl_0(i41 %X) { ret i41 %B } -define <2 x i41> @shl_0_vec_undef_elt(<2 x i41> %X) { -; CHECK-LABEL: @shl_0_vec_undef_elt( +define <2 x i41> @shl_0_vec_poison_elt(<2 x i41> %X) { +; CHECK-LABEL: @shl_0_vec_poison_elt( ; CHECK-NEXT: ret <2 x i41> zeroinitializer ; - %B = shl <2 x i41> , %X + %B = shl <2 x i41> , %X ret <2 x i41> %B } @@ -41,11 +41,11 @@ define i39 @ashr_0(i39 %X) { ret i39 %B } -define <2 x i141> @ashr_0_vec_undef_elt(<2 x i141> %X) { -; CHECK-LABEL: @ashr_0_vec_undef_elt( +define <2 x i141> @ashr_0_vec_poison_elt(<2 x i141> %X) { +; CHECK-LABEL: @ashr_0_vec_poison_elt( ; CHECK-NEXT: ret <2 x i141> zeroinitializer ; - %B = shl <2 x i141> , %X + %B = shl <2 x i141> , %X ret <2 x i141> %B } @@ -113,11 +113,11 @@ define i32 @ashr_all_ones(i32 %A) { ret i32 %B } -define <3 x i8> @ashr_all_ones_vec_with_undef_elts(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @ashr_all_ones_vec_with_undef_elts( +define <3 x i8> @ashr_all_ones_vec_with_poison_elts(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @ashr_all_ones_vec_with_poison_elts( ; CHECK-NEXT: ret <3 x i8> ; - %sh = ashr <3 x i8> , %y + %sh = ashr <3 x i8> , %y ret <3 x i8> %sh } @@ -306,11 +306,22 @@ define <2 x i7> @all_ones_left_right_splat(<2 x i7> %x) { ; Poison could propagate, but undef must not. 
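; A sketch of the rationale (the lane constants here are illustrative):
; a poison lane may be chosen to be any value, so a shl+ashr pair over a
; splat such as <i7 -1, i7 -1, i7 poison> can still fold as an all-ones
; splat, whereas an undef lane must not be treated as if it were poison,
; so the same fold is conservatively blocked. The two tests below pin
; down exactly this split.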
-define <3 x i7> @all_ones_left_right_splat_poison_undef_elt(<3 x i7> %x) { -; CHECK-LABEL: @all_ones_left_right_splat_poison_undef_elt( +define <3 x i7> @all_ones_left_right_splat_undef_elt(<3 x i7> %x) { +; CHECK-LABEL: @all_ones_left_right_splat_undef_elt( +; CHECK-NEXT: [[LEFT:%.*]] = shl <3 x i7> , [[X:%.*]] +; CHECK-NEXT: [[RIGHT:%.*]] = ashr <3 x i7> [[LEFT]], [[X]] +; CHECK-NEXT: ret <3 x i7> [[RIGHT]] +; + %left = shl <3 x i7> , %x + %right = ashr <3 x i7> %left, %x + ret <3 x i7> %right +} + +define <3 x i7> @all_ones_left_right_splat_poison__elt(<3 x i7> %x) { +; CHECK-LABEL: @all_ones_left_right_splat_poison__elt( ; CHECK-NEXT: ret <3 x i7> ; - %left = shl <3 x i7> , %x + %left = shl <3 x i7> , %x %right = ashr <3 x i7> %left, %x ret <3 x i7> %right } diff --git a/llvm/test/Transforms/InstSimplify/srem.ll b/llvm/test/Transforms/InstSimplify/srem.ll index b1cbdf35b3c7cc..ab726832e517bd 100644 --- a/llvm/test/Transforms/InstSimplify/srem.ll +++ b/llvm/test/Transforms/InstSimplify/srem.ll @@ -39,11 +39,11 @@ define <2 x i32> @knownnegation_commute_vec(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %rem } -define <3 x i32> @negated_operand_vec_undef(<3 x i32> %x) { -; CHECK-LABEL: @negated_operand_vec_undef( +define <3 x i32> @negated_operand_vec_poison(<3 x i32> %x) { +; CHECK-LABEL: @negated_operand_vec_poison( ; CHECK-NEXT: ret <3 x i32> zeroinitializer ; - %negx = sub <3 x i32> , %x + %negx = sub <3 x i32> , %x %rem = srem <3 x i32> %negx, %x ret <3 x i32> %rem } diff --git a/llvm/test/Transforms/InstSimplify/sub.ll b/llvm/test/Transforms/InstSimplify/sub.ll index deb0ee33cd9204..fd88fc15716c8c 100644 --- a/llvm/test/Transforms/InstSimplify/sub.ll +++ b/llvm/test/Transforms/InstSimplify/sub.ll @@ -29,7 +29,7 @@ define <2 x i32> @sub_zero_vec(<2 x i32> %A) { ; CHECK-LABEL: @sub_zero_vec( ; CHECK-NEXT: ret <2 x i32> [[A:%.*]] ; - %B = sub <2 x i32> %A, + %B = sub <2 x i32> %A, ret <2 x i32> %B } @@ -46,8 +46,8 @@ define <2 x i32> @neg_neg_vec(<2 x i32> %A) { ; CHECK-LABEL: @neg_neg_vec( ; CHECK-NEXT: ret <2 x i32> [[A:%.*]] ; - %B = sub <2 x i32> , %A - %C = sub <2 x i32> , %B + %B = sub <2 x i32> , %A + %C = sub <2 x i32> , %B ret <2 x i32> %C } diff --git a/llvm/test/Transforms/InstSimplify/xor.ll b/llvm/test/Transforms/InstSimplify/xor.ll index 0e23cc66c1652c..229e943a3836f2 100644 --- a/llvm/test/Transforms/InstSimplify/xor.ll +++ b/llvm/test/Transforms/InstSimplify/xor.ll @@ -156,6 +156,20 @@ define <2 x i4> @xor_and_or_not_undef_elt(<2 x i4> %a, <2 x i4> %b) { ret <2 x i4> %r } +; but correct to propagate poison element + +define <2 x i4> @xor_and_or_not_poison_elt(<2 x i4> %a, <2 x i4> %b) { +; CHECK-LABEL: @xor_and_or_not_poison_elt( +; CHECK-NEXT: [[NOT:%.*]] = xor <2 x i4> [[A:%.*]], +; CHECK-NEXT: ret <2 x i4> [[NOT]] +; + %and = and <2 x i4> %b, %a + %not = xor <2 x i4> %a, + %or = or <2 x i4> %not, %b + %r = xor <2 x i4> %or, %and + ret <2 x i4> %r +} + define i4 @xor_or_and_not_commute0(i4 %a, i4 %b) { ; CHECK-LABEL: @xor_or_and_not_commute0( ; CHECK-NEXT: ret i4 [[A:%.*]] @@ -277,11 +291,11 @@ define i4 @xor_or_and_not_wrong_val2(i4 %a, i4 %b, i4 %c) { ret i4 %r } -define <2 x i4> @xor_or_and_not_undef_elt(<2 x i4> %a, <2 x i4> %b) { -; CHECK-LABEL: @xor_or_and_not_undef_elt( +define <2 x i4> @xor_or_and_not_poison_elt(<2 x i4> %a, <2 x i4> %b) { +; CHECK-LABEL: @xor_or_and_not_poison_elt( ; CHECK-NEXT: ret <2 x i4> [[A:%.*]] ; - %not = xor <2 x i4> %a, + %not = xor <2 x i4> %a, %and = and <2 x i4> %b, %not %or = or <2 x i4> %a, %b %r = xor <2 x i4> %or, %and diff 
--git a/llvm/test/Transforms/Reassociate/inverses.ll b/llvm/test/Transforms/Reassociate/inverses.ll index b6962c6872a9a3..a9d0c4fb032224 100644 --- a/llvm/test/Transforms/Reassociate/inverses.ll +++ b/llvm/test/Transforms/Reassociate/inverses.ll @@ -12,12 +12,12 @@ define i32 @test1(i32 %a, i32 %b) { ret i32 %t5 } -define <2 x i32> @not_op_vec_undef(<2 x i32> %a, <2 x i32> %b) { -; CHECK-LABEL: @not_op_vec_undef( +define <2 x i32> @not_op_vec_poison(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @not_op_vec_poison( ; CHECK-NEXT: ret <2 x i32> zeroinitializer ; %t2 = and <2 x i32> %b, %a - %t4 = xor <2 x i32> %a, + %t4 = xor <2 x i32> %a, %t5 = and <2 x i32> %t2, %t4 ret <2 x i32> %t5 } diff --git a/llvm/test/Transforms/Reassociate/negation.ll b/llvm/test/Transforms/Reassociate/negation.ll index 4718d9d87ae1bc..14ae86fb94aaba 100644 --- a/llvm/test/Transforms/Reassociate/negation.ll +++ b/llvm/test/Transforms/Reassociate/negation.ll @@ -31,16 +31,16 @@ define i32 @test2(i32 %a, i32 %b, i32 %z) { ret i32 %f } -define <2 x i32> @negate_vec_undefs(<2 x i32> %a, <2 x i32> %b, <2 x i32> %z) { -; CHECK-LABEL: @negate_vec_undefs( +define <2 x i32> @negate_vec_poisons(<2 x i32> %a, <2 x i32> %b, <2 x i32> %z) { +; CHECK-LABEL: @negate_vec_poisons( ; CHECK-NEXT: [[E:%.*]] = mul <2 x i32> [[A:%.*]], ; CHECK-NEXT: [[F:%.*]] = mul <2 x i32> [[E]], [[Z:%.*]] ; CHECK-NEXT: ret <2 x i32> [[F]] ; %d = mul <2 x i32> %z, - %c = sub <2 x i32> , %d + %c = sub <2 x i32> , %d %e = mul <2 x i32> %a, %c - %f = sub <2 x i32> , %e + %f = sub <2 x i32> , %e ret <2 x i32> %f } diff --git a/llvm/unittests/IR/ConstantsTest.cpp b/llvm/unittests/IR/ConstantsTest.cpp index 1d6a92c498b061..8f0a507c0fd180 100644 --- a/llvm/unittests/IR/ConstantsTest.cpp +++ b/llvm/unittests/IR/ConstantsTest.cpp @@ -581,7 +581,7 @@ TEST(ConstantsTest, containsUndefElemTest) { } } -// Check that undefined elements in vector constants are matched +// Check that poison elements in vector constants are matched // correctly for both integer and floating-point types. Just don't // crash on vectors of pointers (could be handled?). 
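// A sketch of the contract the updated expectations encode (lane values
// illustrative): {1, 2, poison, 1} is element-wise equal to {1, 2, 1, 1},
// because a poison lane is allowed to match any value, while
// {1, 2, undef, 1} is no longer element-wise equal to {1, 2, 1, 1},
// because undef lanes stop acting as wildcards.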
@@ -590,6 +590,7 @@ TEST(ConstantsTest, isElementWiseEqual) { Type *Int32Ty = Type::getInt32Ty(Context); Constant *CU = UndefValue::get(Int32Ty); + Constant *CP = PoisonValue::get(Int32Ty); Constant *C1 = ConstantInt::get(Int32Ty, 1); Constant *C2 = ConstantInt::get(Int32Ty, 2); @@ -597,15 +598,25 @@ TEST(ConstantsTest, isElementWiseEqual) { Constant *C12U1 = ConstantVector::get({C1, C2, CU, C1}); Constant *C12U2 = ConstantVector::get({C1, C2, CU, C2}); Constant *C12U21 = ConstantVector::get({C1, C2, CU, C2, C1}); + Constant *C12P1 = ConstantVector::get({C1, C2, CP, C1}); + Constant *C12P2 = ConstantVector::get({C1, C2, CP, C2}); + Constant *C12P21 = ConstantVector::get({C1, C2, CP, C2, C1}); - EXPECT_TRUE(C1211->isElementWiseEqual(C12U1)); - EXPECT_TRUE(C12U1->isElementWiseEqual(C1211)); + EXPECT_FALSE(C1211->isElementWiseEqual(C12U1)); + EXPECT_FALSE(C12U1->isElementWiseEqual(C1211)); EXPECT_FALSE(C12U2->isElementWiseEqual(C12U1)); EXPECT_FALSE(C12U1->isElementWiseEqual(C12U2)); EXPECT_FALSE(C12U21->isElementWiseEqual(C12U2)); + EXPECT_TRUE(C1211->isElementWiseEqual(C12P1)); + EXPECT_TRUE(C12P1->isElementWiseEqual(C1211)); + EXPECT_FALSE(C12P2->isElementWiseEqual(C12P1)); + EXPECT_FALSE(C12P1->isElementWiseEqual(C12P2)); + EXPECT_FALSE(C12P21->isElementWiseEqual(C12P2)); + Type *FltTy = Type::getFloatTy(Context); Constant *CFU = UndefValue::get(FltTy); + Constant *CFP = PoisonValue::get(FltTy); Constant *CF1 = ConstantFP::get(FltTy, 1.0); Constant *CF2 = ConstantFP::get(FltTy, 2.0); @@ -613,25 +624,41 @@ TEST(ConstantsTest, isElementWiseEqual) { Constant *CF12U1 = ConstantVector::get({CF1, CF2, CFU, CF1}); Constant *CF12U2 = ConstantVector::get({CF1, CF2, CFU, CF2}); Constant *CFUU1U = ConstantVector::get({CFU, CFU, CF1, CFU}); + Constant *CF12P1 = ConstantVector::get({CF1, CF2, CFP, CF1}); + Constant *CF12P2 = ConstantVector::get({CF1, CF2, CFP, CF2}); + Constant *CFPP1P = ConstantVector::get({CFP, CFP, CF1, CFP}); - EXPECT_TRUE(CF1211->isElementWiseEqual(CF12U1)); - EXPECT_TRUE(CF12U1->isElementWiseEqual(CF1211)); - EXPECT_TRUE(CFUU1U->isElementWiseEqual(CF12U1)); + EXPECT_FALSE(CF1211->isElementWiseEqual(CF12U1)); + EXPECT_FALSE(CF12U1->isElementWiseEqual(CF1211)); + EXPECT_FALSE(CFUU1U->isElementWiseEqual(CF12U1)); EXPECT_FALSE(CF12U2->isElementWiseEqual(CF12U1)); EXPECT_FALSE(CF12U1->isElementWiseEqual(CF12U2)); + EXPECT_TRUE(CF1211->isElementWiseEqual(CF12P1)); + EXPECT_TRUE(CF12P1->isElementWiseEqual(CF1211)); + EXPECT_TRUE(CFPP1P->isElementWiseEqual(CF12P1)); + EXPECT_FALSE(CF12P2->isElementWiseEqual(CF12P1)); + EXPECT_FALSE(CF12P1->isElementWiseEqual(CF12P2)); + PointerType *PtrTy = PointerType::get(Context, 0); Constant *CPU = UndefValue::get(PtrTy); + Constant *CPP = PoisonValue::get(PtrTy); Constant *CP0 = ConstantPointerNull::get(PtrTy); Constant *CP0000 = ConstantVector::get({CP0, CP0, CP0, CP0}); Constant *CP00U0 = ConstantVector::get({CP0, CP0, CPU, CP0}); Constant *CP00U = ConstantVector::get({CP0, CP0, CPU}); + Constant *CP00P0 = ConstantVector::get({CP0, CP0, CPP, CP0}); + Constant *CP00P = ConstantVector::get({CP0, CP0, CPP}); EXPECT_FALSE(CP0000->isElementWiseEqual(CP00U0)); EXPECT_FALSE(CP00U0->isElementWiseEqual(CP0000)); EXPECT_FALSE(CP0000->isElementWiseEqual(CP00U)); EXPECT_FALSE(CP00U->isElementWiseEqual(CP00U0)); + EXPECT_FALSE(CP0000->isElementWiseEqual(CP00P0)); + EXPECT_FALSE(CP00P0->isElementWiseEqual(CP0000)); + EXPECT_FALSE(CP0000->isElementWiseEqual(CP00P)); + EXPECT_FALSE(CP00P->isElementWiseEqual(CP00P0)); } // Check that vector/aggregate constants 
correctly store undef and poison diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index 4d0c2e4220fec7..133012684d16d8 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ b/llvm/unittests/IR/PatternMatch.cpp @@ -1184,6 +1184,8 @@ TEST_F(PatternMatchTest, VectorUndefInt) { Type *VectorTy = FixedVectorType::get(ScalarTy, 4); Constant *ScalarUndef = UndefValue::get(ScalarTy); Constant *VectorUndef = UndefValue::get(VectorTy); + Constant *ScalarPoison = PoisonValue::get(ScalarTy); + Constant *VectorPoison = PoisonValue::get(VectorTy); Constant *ScalarZero = Constant::getNullValue(ScalarTy); Constant *VectorZero = Constant::getNullValue(VectorTy); @@ -1194,17 +1196,30 @@ TEST_F(PatternMatchTest, VectorUndefInt) { Elems.push_back(ScalarZero); Constant *VectorZeroUndef = ConstantVector::get(Elems); + SmallVector Elems2; + Elems2.push_back(ScalarPoison); + Elems2.push_back(ScalarZero); + Elems2.push_back(ScalarPoison); + Elems2.push_back(ScalarZero); + Constant *VectorZeroPoison = ConstantVector::get(Elems2); + EXPECT_TRUE(match(ScalarUndef, m_Undef())); + EXPECT_TRUE(match(ScalarPoison, m_Undef())); EXPECT_TRUE(match(VectorUndef, m_Undef())); + EXPECT_TRUE(match(VectorPoison, m_Undef())); EXPECT_FALSE(match(ScalarZero, m_Undef())); EXPECT_FALSE(match(VectorZero, m_Undef())); EXPECT_FALSE(match(VectorZeroUndef, m_Undef())); + EXPECT_FALSE(match(VectorZeroPoison, m_Undef())); EXPECT_FALSE(match(ScalarUndef, m_Zero())); + EXPECT_FALSE(match(ScalarPoison, m_Zero())); EXPECT_FALSE(match(VectorUndef, m_Zero())); + EXPECT_FALSE(match(VectorPoison, m_Zero())); + EXPECT_FALSE(match(VectorZeroUndef, m_Zero())); EXPECT_TRUE(match(ScalarZero, m_Zero())); EXPECT_TRUE(match(VectorZero, m_Zero())); - EXPECT_TRUE(match(VectorZeroUndef, m_Zero())); + EXPECT_TRUE(match(VectorZeroPoison, m_Zero())); const APInt *C; // Regardless of whether undefs are allowed, @@ -1249,6 +1264,8 @@ TEST_F(PatternMatchTest, VectorUndefFloat) { Type *VectorTy = FixedVectorType::get(ScalarTy, 4); Constant *ScalarUndef = UndefValue::get(ScalarTy); Constant *VectorUndef = UndefValue::get(VectorTy); + Constant *ScalarPoison = PoisonValue::get(ScalarTy); + Constant *VectorPoison = PoisonValue::get(VectorTy); Constant *ScalarZero = Constant::getNullValue(ScalarTy); Constant *VectorZero = Constant::getNullValue(VectorTy); Constant *ScalarPosInf = ConstantFP::getInfinity(ScalarTy, false); @@ -1258,72 +1275,116 @@ TEST_F(PatternMatchTest, VectorUndefFloat) { Constant *VectorZeroUndef = ConstantVector::get({ScalarUndef, ScalarZero, ScalarUndef, ScalarZero}); + Constant *VectorZeroPoison = + ConstantVector::get({ScalarPoison, ScalarZero, ScalarPoison, ScalarZero}); + Constant *VectorInfUndef = ConstantVector::get( {ScalarPosInf, ScalarNegInf, ScalarUndef, ScalarPosInf}); + Constant *VectorInfPoison = ConstantVector::get( + {ScalarPosInf, ScalarNegInf, ScalarPoison, ScalarPosInf}); + Constant *VectorNaNUndef = ConstantVector::get({ScalarUndef, ScalarNaN, ScalarNaN, ScalarNaN}); + Constant *VectorNaNPoison = + ConstantVector::get({ScalarPoison, ScalarNaN, ScalarNaN, ScalarNaN}); + EXPECT_TRUE(match(ScalarUndef, m_Undef())); EXPECT_TRUE(match(VectorUndef, m_Undef())); + EXPECT_TRUE(match(ScalarPoison, m_Undef())); + EXPECT_TRUE(match(VectorPoison, m_Undef())); EXPECT_FALSE(match(ScalarZero, m_Undef())); EXPECT_FALSE(match(VectorZero, m_Undef())); EXPECT_FALSE(match(VectorZeroUndef, m_Undef())); EXPECT_FALSE(match(VectorInfUndef, m_Undef())); EXPECT_FALSE(match(VectorNaNUndef, m_Undef())); + 
EXPECT_FALSE(match(VectorZeroPoison, m_Undef())); + EXPECT_FALSE(match(VectorInfPoison, m_Undef())); + EXPECT_FALSE(match(VectorNaNPoison, m_Undef())); EXPECT_FALSE(match(ScalarUndef, m_AnyZeroFP())); EXPECT_FALSE(match(VectorUndef, m_AnyZeroFP())); + EXPECT_FALSE(match(ScalarPoison, m_AnyZeroFP())); + EXPECT_FALSE(match(VectorPoison, m_AnyZeroFP())); EXPECT_TRUE(match(ScalarZero, m_AnyZeroFP())); EXPECT_TRUE(match(VectorZero, m_AnyZeroFP())); - EXPECT_TRUE(match(VectorZeroUndef, m_AnyZeroFP())); + EXPECT_FALSE(match(VectorZeroUndef, m_AnyZeroFP())); EXPECT_FALSE(match(VectorInfUndef, m_AnyZeroFP())); EXPECT_FALSE(match(VectorNaNUndef, m_AnyZeroFP())); + EXPECT_TRUE(match(VectorZeroPoison, m_AnyZeroFP())); + EXPECT_FALSE(match(VectorInfPoison, m_AnyZeroFP())); + EXPECT_FALSE(match(VectorNaNPoison, m_AnyZeroFP())); EXPECT_FALSE(match(ScalarUndef, m_NaN())); EXPECT_FALSE(match(VectorUndef, m_NaN())); EXPECT_FALSE(match(VectorZeroUndef, m_NaN())); + EXPECT_FALSE(match(ScalarPoison, m_NaN())); + EXPECT_FALSE(match(VectorPoison, m_NaN())); + EXPECT_FALSE(match(VectorZeroPoison, m_NaN())); EXPECT_FALSE(match(ScalarPosInf, m_NaN())); EXPECT_FALSE(match(ScalarNegInf, m_NaN())); EXPECT_TRUE(match(ScalarNaN, m_NaN())); EXPECT_FALSE(match(VectorInfUndef, m_NaN())); - EXPECT_TRUE(match(VectorNaNUndef, m_NaN())); + EXPECT_FALSE(match(VectorNaNUndef, m_NaN())); + EXPECT_FALSE(match(VectorInfPoison, m_NaN())); + EXPECT_TRUE(match(VectorNaNPoison, m_NaN())); EXPECT_FALSE(match(ScalarUndef, m_NonNaN())); EXPECT_FALSE(match(VectorUndef, m_NonNaN())); - EXPECT_TRUE(match(VectorZeroUndef, m_NonNaN())); + EXPECT_FALSE(match(VectorZeroUndef, m_NonNaN())); + EXPECT_FALSE(match(ScalarPoison, m_NonNaN())); + EXPECT_FALSE(match(VectorPoison, m_NonNaN())); + EXPECT_TRUE(match(VectorZeroPoison, m_NonNaN())); EXPECT_TRUE(match(ScalarPosInf, m_NonNaN())); EXPECT_TRUE(match(ScalarNegInf, m_NonNaN())); EXPECT_FALSE(match(ScalarNaN, m_NonNaN())); - EXPECT_TRUE(match(VectorInfUndef, m_NonNaN())); + EXPECT_FALSE(match(VectorInfUndef, m_NonNaN())); EXPECT_FALSE(match(VectorNaNUndef, m_NonNaN())); + EXPECT_TRUE(match(VectorInfPoison, m_NonNaN())); + EXPECT_FALSE(match(VectorNaNPoison, m_NonNaN())); EXPECT_FALSE(match(ScalarUndef, m_Inf())); EXPECT_FALSE(match(VectorUndef, m_Inf())); EXPECT_FALSE(match(VectorZeroUndef, m_Inf())); + EXPECT_FALSE(match(ScalarPoison, m_Inf())); + EXPECT_FALSE(match(VectorPoison, m_Inf())); + EXPECT_FALSE(match(VectorZeroPoison, m_Inf())); EXPECT_TRUE(match(ScalarPosInf, m_Inf())); EXPECT_TRUE(match(ScalarNegInf, m_Inf())); EXPECT_FALSE(match(ScalarNaN, m_Inf())); - EXPECT_TRUE(match(VectorInfUndef, m_Inf())); + EXPECT_FALSE(match(VectorInfUndef, m_Inf())); EXPECT_FALSE(match(VectorNaNUndef, m_Inf())); + EXPECT_TRUE(match(VectorInfPoison, m_Inf())); + EXPECT_FALSE(match(VectorNaNPoison, m_Inf())); EXPECT_FALSE(match(ScalarUndef, m_NonInf())); EXPECT_FALSE(match(VectorUndef, m_NonInf())); - EXPECT_TRUE(match(VectorZeroUndef, m_NonInf())); + EXPECT_FALSE(match(VectorZeroUndef, m_NonInf())); + EXPECT_FALSE(match(ScalarPoison, m_NonInf())); + EXPECT_FALSE(match(VectorPoison, m_NonInf())); + EXPECT_TRUE(match(VectorZeroPoison, m_NonInf())); EXPECT_FALSE(match(ScalarPosInf, m_NonInf())); EXPECT_FALSE(match(ScalarNegInf, m_NonInf())); EXPECT_TRUE(match(ScalarNaN, m_NonInf())); EXPECT_FALSE(match(VectorInfUndef, m_NonInf())); - EXPECT_TRUE(match(VectorNaNUndef, m_NonInf())); + EXPECT_FALSE(match(VectorNaNUndef, m_NonInf())); + EXPECT_FALSE(match(VectorInfPoison, m_NonInf())); + 
EXPECT_TRUE(match(VectorNaNPoison, m_NonInf())); EXPECT_FALSE(match(ScalarUndef, m_Finite())); EXPECT_FALSE(match(VectorUndef, m_Finite())); - EXPECT_TRUE(match(VectorZeroUndef, m_Finite())); + EXPECT_FALSE(match(VectorZeroUndef, m_Finite())); + EXPECT_FALSE(match(ScalarPoison, m_Finite())); + EXPECT_FALSE(match(VectorPoison, m_Finite())); + EXPECT_TRUE(match(VectorZeroPoison, m_Finite())); EXPECT_FALSE(match(ScalarPosInf, m_Finite())); EXPECT_FALSE(match(ScalarNegInf, m_Finite())); EXPECT_FALSE(match(ScalarNaN, m_Finite())); EXPECT_FALSE(match(VectorInfUndef, m_Finite())); EXPECT_FALSE(match(VectorNaNUndef, m_Finite())); + EXPECT_FALSE(match(VectorInfPoison, m_Finite())); + EXPECT_FALSE(match(VectorNaNPoison, m_Finite())); const APFloat *C; // Regardless of whether undefs are allowed, @@ -1707,38 +1768,57 @@ TEST_F(PatternMatchTest, ConstantPredicateType) { Constant *CMixedU32 = ConstantVector::get({CU32Max, CU32Zero, CU32DeadBeef}); Constant *CU32Undef = UndefValue::get(U32Ty); + Constant *CU32Poison = PoisonValue::get(U32Ty); Constant *CU32MaxWithUndef = ConstantVector::get({CU32Undef, CU32Max, CU32Undef}); + Constant *CU32MaxWithPoison = + ConstantVector::get({CU32Poison, CU32Max, CU32Poison}); EXPECT_FALSE(match(CMixedU32, cst_pred_ty())); EXPECT_FALSE(match(CMixedU32, cst_pred_ty())); EXPECT_TRUE(match(CMixedU32, cst_pred_ty>())); EXPECT_FALSE(match(CMixedU32, cst_pred_ty>())); - EXPECT_TRUE(match(CU32MaxWithUndef, cst_pred_ty())); + EXPECT_FALSE(match(CU32MaxWithUndef, cst_pred_ty())); EXPECT_FALSE(match(CU32MaxWithUndef, cst_pred_ty())); - EXPECT_TRUE(match(CU32MaxWithUndef, cst_pred_ty>())); + EXPECT_FALSE(match(CU32MaxWithUndef, cst_pred_ty>())); EXPECT_FALSE( match(CU32MaxWithUndef, cst_pred_ty>())); + EXPECT_TRUE(match(CU32MaxWithPoison, cst_pred_ty())); + EXPECT_FALSE(match(CU32MaxWithPoison, cst_pred_ty())); + EXPECT_TRUE(match(CU32MaxWithPoison, cst_pred_ty>())); + EXPECT_FALSE( + match(CU32MaxWithPoison, cst_pred_ty>())); + // Float arbitrary vector Constant *CMixedF32 = ConstantVector::get({CF32NaN, CF32Zero, CF32Pi}); Constant *CF32Undef = UndefValue::get(F32Ty); + Constant *CF32Poison = PoisonValue::get(F32Ty); Constant *CF32NaNWithUndef = ConstantVector::get({CF32Undef, CF32NaN, CF32Undef}); + Constant *CF32NaNWithPoison = + ConstantVector::get({CF32Poison, CF32NaN, CF32Poison}); EXPECT_FALSE(match(CMixedF32, cstfp_pred_ty())); EXPECT_FALSE(match(CMixedF32, cstfp_pred_ty())); EXPECT_TRUE(match(CMixedF32, cstfp_pred_ty>())); EXPECT_FALSE(match(CMixedF32, cstfp_pred_ty>())); - EXPECT_TRUE(match(CF32NaNWithUndef, cstfp_pred_ty())); + EXPECT_FALSE(match(CF32NaNWithUndef, cstfp_pred_ty())); EXPECT_FALSE(match(CF32NaNWithUndef, cstfp_pred_ty())); - EXPECT_TRUE( + EXPECT_FALSE( match(CF32NaNWithUndef, cstfp_pred_ty>())); EXPECT_FALSE( match(CF32NaNWithUndef, cstfp_pred_ty>())); + + EXPECT_TRUE(match(CF32NaNWithPoison, cstfp_pred_ty())); + EXPECT_FALSE(match(CF32NaNWithPoison, cstfp_pred_ty())); + EXPECT_TRUE( + match(CF32NaNWithPoison, cstfp_pred_ty>())); + EXPECT_FALSE( + match(CF32NaNWithPoison, cstfp_pred_ty>())); } TEST_F(PatternMatchTest, InsertValue) { @@ -1888,35 +1968,44 @@ TEST_F(PatternMatchTest, NotForbidUndef) { Type *ScalarTy = IRB.getInt8Ty(); Type *VectorTy = FixedVectorType::get(ScalarTy, 3); Constant *ScalarUndef = UndefValue::get(ScalarTy); + Constant *ScalarPoison = PoisonValue::get(ScalarTy); Constant *ScalarOnes = Constant::getAllOnesValue(ScalarTy); Constant *VectorZero = Constant::getNullValue(VectorTy); Constant *VectorOnes = 
Constant::getAllOnesValue(VectorTy);
-  SmallVector MixedElems;
-  MixedElems.push_back(ScalarOnes);
-  MixedElems.push_back(ScalarOnes);
-  MixedElems.push_back(ScalarUndef);
-  Constant *VectorMixed = ConstantVector::get(MixedElems);
+  SmallVector MixedElemsUndef;
+  MixedElemsUndef.push_back(ScalarOnes);
+  MixedElemsUndef.push_back(ScalarOnes);
+  MixedElemsUndef.push_back(ScalarUndef);
+  Constant *VectorMixedUndef = ConstantVector::get(MixedElemsUndef);
+
+  SmallVector MixedElemsPoison;
+  MixedElemsPoison.push_back(ScalarOnes);
+  MixedElemsPoison.push_back(ScalarOnes);
+  MixedElemsPoison.push_back(ScalarPoison);
+  Constant *VectorMixedPoison = ConstantVector::get(MixedElemsPoison);

   Value *Not = IRB.CreateXor(VectorZero, VectorOnes);
   Value *X;
-  EXPECT_TRUE(match(Not, m_Not(m_Value())));
-  EXPECT_TRUE(match(Not, m_NotForbidUndef(m_Value(X))));
+  EXPECT_TRUE(match(Not, m_Not(m_Value(X))));
   EXPECT_TRUE(match(X, m_Zero()));

   Value *NotCommute = IRB.CreateXor(VectorOnes, VectorZero);
   Value *Y;
-  EXPECT_TRUE(match(NotCommute, m_Not(m_Value())));
-  EXPECT_TRUE(match(NotCommute, m_NotForbidUndef(m_Value(Y))));
+  EXPECT_TRUE(match(NotCommute, m_Not(m_Value(Y))));
   EXPECT_TRUE(match(Y, m_Zero()));

-  Value *NotWithUndefs = IRB.CreateXor(VectorZero, VectorMixed);
-  EXPECT_TRUE(match(NotWithUndefs, m_Not(m_Value())));
-  EXPECT_FALSE(match(NotWithUndefs, m_NotForbidUndef(m_Value())));
+  Value *NotWithUndefs = IRB.CreateXor(VectorZero, VectorMixedUndef);
+  EXPECT_FALSE(match(NotWithUndefs, m_Not(m_Value())));
+
+  Value *NotWithPoisons = IRB.CreateXor(VectorZero, VectorMixedPoison);
+  EXPECT_TRUE(match(NotWithPoisons, m_Not(m_Value())));
+
+  Value *NotWithUndefsCommute = IRB.CreateXor(VectorMixedUndef, VectorZero);
+  EXPECT_FALSE(match(NotWithUndefsCommute, m_Not(m_Value())));

-  Value *NotWithUndefsCommute = IRB.CreateXor(VectorMixed, VectorZero);
-  EXPECT_TRUE(match(NotWithUndefsCommute, m_Not(m_Value())));
-  EXPECT_FALSE(match(NotWithUndefsCommute, m_NotForbidUndef(m_Value(X))));
+  Value *NotWithPoisonsCommute = IRB.CreateXor(VectorMixedPoison, VectorZero);
+  EXPECT_TRUE(match(NotWithPoisonsCommute, m_Not(m_Value())));
 }

 template struct MutableConstTest : PatternMatchTest { };

From 971237dab259bdaa619403fc6472bd1758d4dc18 Mon Sep 17 00:00:00 2001
From: jeanPerier
Date: Wed, 17 Apr 2024 11:31:29 +0200
Subject: [PATCH 226/300] [flang] Retain internal and BIND(C) host procedure link in FIR (#87796)

Currently, it is not possible to recover which func.func is the host
procedure of an internal procedure, because the internal procedure's
mangled name carries no information about the BIND(C) name of the host.
This information may be useful, for instance, to ensure that the DWARF
DW_TAG_subprogram entries of internal procedures are nested under those
of their host procedures.
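A minimal sketch of the resulting FIR (procedure names taken from the test
added below; the llvm.linkage attribute elided):

  // Host procedure: its func.func symbol is its BIND(C) name.
  func.func @some_c_name() {
    return
  }
  // Internal procedure: fir.host_symbol points back at the host's symbol,
  // which the mangled name _QFfooPbar alone cannot recover.
  func.func private @_QFfooPbar() attributes {fir.host_symbol = @some_c_name} {
    return
  }

Renaming passes then keep the link consistent by updating the symbol
reference, as the AFTER_RENAME_PASS checks in the new test verify.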
--- flang/include/flang/Lower/CallInterface.h | 4 -- .../flang/Optimizer/Dialect/FIROpsSupport.h | 10 ++--- flang/lib/Lower/CallInterface.cpp | 40 +++++++++++++------ .../Optimizer/Transforms/ArrayValueCopy.cpp | 2 +- .../HLFIR/internal-procedures-bindc-host.f90 | 39 ++++++++++++++++++ .../test/Lower/HLFIR/internal-procedures.f90 | 6 +-- flang/test/Lower/OpenACC/acc-routine04.f90 | 2 +- .../threadprivate-host-association-2.f90 | 2 +- .../OpenMP/threadprivate-host-association.f90 | 2 +- flang/test/Lower/character-elemental.f90 | 15 ++++--- .../Lower/equivalence-with-host-assoc.f90 | 16 ++++---- .../Lower/explicit-interface-results-2.f90 | 4 +- .../test/Lower/host-associated-functions.f90 | 6 +-- flang/test/Lower/host-associated-globals.f90 | 6 +-- flang/test/Lower/host-associated.f90 | 32 +++++++-------- flang/test/Lower/polymorphic.f90 | 2 +- 16 files changed, 119 insertions(+), 69 deletions(-) create mode 100644 flang/test/Lower/HLFIR/internal-procedures-bindc-host.f90 diff --git a/flang/include/flang/Lower/CallInterface.h b/flang/include/flang/Lower/CallInterface.h index 80b05764253778..a11e81b6593de1 100644 --- a/flang/include/flang/Lower/CallInterface.h +++ b/flang/include/flang/Lower/CallInterface.h @@ -391,9 +391,6 @@ class CallerInterface : public CallInterface { llvm_unreachable("getting host associated type in CallerInterface"); } - /// Set attributes on MLIR function. - void setFuncAttrs(mlir::func::FuncOp) const {} - private: /// Check that the input vector is complete. bool verifyActualInputs() const; @@ -444,7 +441,6 @@ class CalleeInterface : public CallInterface { bool hasHostAssociated() const; mlir::Type getHostAssociatedTy() const; mlir::Value getHostAssociatedTuple() const; - void setFuncAttrs(mlir::func::FuncOp) const; private: Fortran::lower::pft::FunctionLikeUnit &funit; diff --git a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h index 3266ea3aa7fdc6..46b62d8de8d379 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h +++ b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h @@ -104,9 +104,9 @@ static constexpr llvm::StringRef getHostAssocAttrName() { return "fir.host_assoc"; } -/// Attribute to mark an internal procedure. -static constexpr llvm::StringRef getInternalProcedureAttrName() { - return "fir.internal_proc"; +/// Attribute to link an internal procedure to its host procedure symbol. +static constexpr llvm::StringRef getHostSymbolAttrName() { + return "fir.host_symbol"; } /// Attribute containing the original name of a function from before the @@ -122,8 +122,8 @@ bool hasHostAssociationArgument(mlir::func::FuncOp func); /// Is the function, \p func an internal procedure ? /// Some internal procedures may have access to saved host procedure /// variables even when they do not have a tuple argument. 
-inline bool isInternalPorcedure(mlir::func::FuncOp func) { - return func->hasAttr(fir::getInternalProcedureAttrName()); +inline bool isInternalProcedure(mlir::func::FuncOp func) { + return func->hasAttr(fir::getHostSymbolAttrName()); } /// Tell if \p value is: diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp index 05a0c10c709749..2d4d17a2ef12e9 100644 --- a/flang/lib/Lower/CallInterface.cpp +++ b/flang/lib/Lower/CallInterface.cpp @@ -575,13 +575,6 @@ mlir::Value Fortran::lower::CalleeInterface::getHostAssociatedTuple() const { return converter.hostAssocTupleValue(); } -void Fortran::lower::CalleeInterface::setFuncAttrs( - mlir::func::FuncOp func) const { - if (funit.parentHasHostAssoc()) - func->setAttr(fir::getInternalProcedureAttrName(), - mlir::UnitAttr::get(func->getContext())); -} - //===----------------------------------------------------------------------===// // CallInterface implementation: this part is common to both caller and callee. //===----------------------------------------------------------------------===// @@ -589,6 +582,34 @@ void Fortran::lower::CalleeInterface::setFuncAttrs( static void addSymbolAttribute(mlir::func::FuncOp func, const Fortran::semantics::Symbol &sym, mlir::MLIRContext &mlirContext) { + const Fortran::semantics::Symbol &ultimate = sym.GetUltimate(); + // The link between an internal procedure and its host procedure is lost + // in FIR if the host is BIND(C) since the internal mangling will not + // allow retrieving the host bind(C) name, and therefore func.func symbol. + // Preserve it as an attribute so that this can be later retrieved. + if (Fortran::semantics::ClassifyProcedure(ultimate) == + Fortran::semantics::ProcedureDefinitionClass::Internal) { + if (ultimate.owner().kind() == + Fortran::semantics::Scope::Kind::Subprogram) { + if (const Fortran::semantics::Symbol *hostProcedure = + ultimate.owner().symbol()) { + std::string hostName = Fortran::lower::mangle::mangleName( + *hostProcedure, /*keepExternalInScope=*/true); + func->setAttr( + fir::getHostSymbolAttrName(), + mlir::SymbolRefAttr::get( + &mlirContext, mlir::StringAttr::get(&mlirContext, hostName))); + } + } else if (ultimate.owner().kind() == + Fortran::semantics::Scope::Kind::MainProgram) { + func->setAttr(fir::getHostSymbolAttrName(), + mlir::SymbolRefAttr::get( + &mlirContext, + mlir::StringAttr::get( + &mlirContext, fir::NameUniquer::doProgramEntry()))); + } + } + // Only add this on bind(C) functions for which the symbol is not reflected in // the current context. if (!Fortran::semantics::IsBindCProcedure(sym)) @@ -686,7 +707,6 @@ void Fortran::lower::CallInterface::declare() { for (const auto &placeHolder : llvm::enumerate(inputs)) if (!placeHolder.value().attributes.empty()) func.setArgAttrs(placeHolder.index(), placeHolder.value().attributes); - side().setFuncAttrs(func); setCUDAAttributes(func, side().getProcedureSymbol(), characteristic); } @@ -1599,10 +1619,6 @@ class SignatureBuilder return proc; } - /// Set internal procedure attribute on MLIR function. Internal procedure - /// are defined in the current file and will not go through SignatureBuilder. - void setFuncAttrs(mlir::func::FuncOp) const {} - /// This is not the description of an indirect call. 
static constexpr bool isIndirectCall() { return false; } diff --git a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp index 675314ed9da038..18ca5711bfea89 100644 --- a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp +++ b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp @@ -728,7 +728,7 @@ conservativeCallConflict(llvm::ArrayRef reaches) { if (auto callee = call.getCallableForCallee().dyn_cast()) { auto module = op->getParentOfType(); - return isInternalPorcedure( + return isInternalProcedure( module.lookupSymbol(callee)); } return false; diff --git a/flang/test/Lower/HLFIR/internal-procedures-bindc-host.f90 b/flang/test/Lower/HLFIR/internal-procedures-bindc-host.f90 new file mode 100644 index 00000000000000..07f60b98b0941f --- /dev/null +++ b/flang/test/Lower/HLFIR/internal-procedures-bindc-host.f90 @@ -0,0 +1,39 @@ +! Test fir.host_sym attribute to retain link between internal +! and host procedure in FIR even when BIND(C) is involved. + +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s +! RUN: bbc -emit-hlfir -o - %s | fir-opt -external-name-interop -o - |FileCheck %s --check-prefix=AFTER_RENAME_PASS + +subroutine foo() bind(c, name="some_c_name") + call bar() +contains + subroutine bar() + end subroutine +end subroutine +! CHECK: func.func @some_c_name() +! CHECK: func.func private @_QFfooPbar() attributes {fir.host_symbol = @some_c_name, llvm.linkage = #llvm.linkage} +! AFTER_RENAME_PASS: func.func @some_c_name() +! AFTER_RENAME_PASS: func.func private @_QFfooPbar() attributes {fir.host_symbol = @some_c_name, llvm.linkage = #llvm.linkage} + +subroutine notbindc() + call bar() +contains + subroutine bar() + end subroutine +end subroutine +! CHECK: func.func @_QPnotbindc() +! CHECK: func.func private @_QFnotbindcPbar() attributes {fir.host_symbol = @_QPnotbindc, llvm.linkage = #llvm.linkage} +! AFTER_RENAME_PASS: func.func @notbindc_() attributes {fir.internal_name = "_QPnotbindc"} +! AFTER_RENAME_PASS: func.func private @_QFnotbindcPbar() attributes {fir.host_symbol = @notbindc_, llvm.linkage = #llvm.linkage} + + +! Main program +call bar() +contains + subroutine bar() + end subroutine +end +! CHECK: func.func @_QQmain() +! CHECK: func.func private @_QFPbar() attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage} +! AFTER_RENAME_PASS: func.func @_QQmain() +! AFTER_RENAME_PASS: func.func private @_QFPbar() attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage} diff --git a/flang/test/Lower/HLFIR/internal-procedures.f90 b/flang/test/Lower/HLFIR/internal-procedures.f90 index c898903b6fbe11..fff7125897ddfe 100644 --- a/flang/test/Lower/HLFIR/internal-procedures.f90 +++ b/flang/test/Lower/HLFIR/internal-procedures.f90 @@ -10,7 +10,7 @@ subroutine internal end subroutine end subroutine ! CHECK-LABEL: func.func private @_QFtest_explicit_shape_arrayPinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>>, i32) -> !fir.ref>> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref>> @@ -28,7 +28,7 @@ subroutine internal end subroutine end subroutine ! CHECK-LABEL: func.func private @_QFtest_assumed_shapePinternal( -! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>>, i32) -> !fir.ref>> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref>> @@ -45,7 +45,7 @@ subroutine internal() end subroutine end subroutine ! CHECK-LABEL: func.func private @_QFtest_scalar_charPinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>, i32) -> !fir.ref> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref> diff --git a/flang/test/Lower/OpenACC/acc-routine04.f90 b/flang/test/Lower/OpenACC/acc-routine04.f90 index 2339c23eaaf857..f6033761639013 100644 --- a/flang/test/Lower/OpenACC/acc-routine04.f90 +++ b/flang/test/Lower/OpenACC/acc-routine04.f90 @@ -31,4 +31,4 @@ subroutine sub2() ! CHECK: acc.routine @acc_routine_0 func(@_QMdummy_modPsub1) seq ! CHECK: func.func @_QMdummy_modPsub1(%arg0: !fir.ref {fir.bindc_name = "i"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} ! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "test_acc_routine"} -! CHECK: func.func private @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>, llvm.linkage = #llvm.linkage} +! CHECK: func.func private @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>, fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage} diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 index b47bff5bebb0b2..a8d29baf74f220 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 @@ -12,7 +12,7 @@ !CHECK: fir.call @_QFPsub() fastmath : () -> () !CHECK: return !CHECK: } -!CHECK: func.func private @_QFPsub() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +!CHECK: func.func private @_QFPsub() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { !CHECK: %[[A:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) : !fir.ref diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association.f90 index 98f7b51bb97115..096e510c19c690 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association.f90 @@ -11,7 +11,7 @@ !CHECK: fir.call @_QFPsub() fastmath : () -> () !CHECK: return !CHECK: } -!CHECK: func.func private @_QFPsub() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +!CHECK: func.func private @_QFPsub() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { !CHECK: %[[A:.*]] = fir.address_of(@_QFEa) : !fir.ref !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: 
%[[TP_A:.*]] = omp.threadprivate %[[A_DECL]]#1 : !fir.ref -> !fir.ref diff --git a/flang/test/Lower/character-elemental.f90 b/flang/test/Lower/character-elemental.f90 index 6c46454176f536..9a9cf8bf2d9c63 100644 --- a/flang/test/Lower/character-elemental.f90 +++ b/flang/test/Lower/character-elemental.f90 @@ -5,6 +5,12 @@ subroutine substring_main character*7 :: string(2) = ['12 ', '12 '] integer :: result(2) integer :: ival +interface + elemental function inner(arg) + character(len=*), intent(in) :: arg + integer :: inner + end function inner +end interface ival = 1 ! CHECK: %[[a0:.*]] = fir.alloca i32 {bindc_name = "ival", uniq_name = "_QFsubstring_mainEival"} @@ -26,14 +32,7 @@ subroutine substring_main ! CHECK: %[[a14:.*]] = fir.coordinate_of %[[a13]], %[[a12]] : (!fir.ref>>, index) -> !fir.ref> ! CHECK: %[[a15:.*]] = fir.convert %[[a14]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[a16:.*]] = fir.emboxchar %[[a15]], {{.*}} : (!fir.ref>, index) -> !fir.boxchar<1> - ! CHECK: %[[a17:.*]] = fir.call @_QFsubstring_mainPinner(%[[a16]]) {{.*}}: (!fir.boxchar<1>) -> i32 + ! CHECK: %[[a17:.*]] = fir.call @_QPinner(%[[a16]]) {{.*}}: (!fir.boxchar<1>) -> i32 result = inner(string(1:2)(ival:ival)) print *, result -contains - elemental function inner(arg) - character(len=*), intent(in) :: arg - integer :: inner - - inner = len(arg) - end function inner end subroutine substring_main diff --git a/flang/test/Lower/equivalence-with-host-assoc.f90 b/flang/test/Lower/equivalence-with-host-assoc.f90 index 0ffb1bc5bf9ee1..b8ce72f3787c0d 100644 --- a/flang/test/Lower/equivalence-with-host-assoc.f90 +++ b/flang/test/Lower/equivalence-with-host-assoc.f90 @@ -10,7 +10,7 @@ subroutine inner i1 = j1 end subroutine inner end subroutine test1 -! FIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! FIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! FIR: %[[VAL_0:.*]] = fir.address_of(@_QFtest1Ei1) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -24,7 +24,7 @@ end subroutine test1 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! HLFIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@_QFtest1Ei1) : !fir.ref> ! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -54,7 +54,7 @@ subroutine inner end subroutine inner end subroutine host end module test2 -! FIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! FIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! FIR: %[[VAL_0:.*]] = fir.address_of(@_QMtest2FhostEf1) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -68,7 +68,7 @@ end module test2 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! HLFIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@_QMtest2FhostEf1) : !fir.ref> ! 
HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -94,7 +94,7 @@ subroutine inner i1 = j1 + k1 end subroutine inner end subroutine test3 -! FIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! FIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! FIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -115,7 +115,7 @@ end subroutine test3 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! HLFIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -149,7 +149,7 @@ subroutine inner i1 = j1 + k1 end subroutine inner end subroutine test4 -! FIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! FIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! FIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -170,7 +170,7 @@ end subroutine test4 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! HLFIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/explicit-interface-results-2.f90 b/flang/test/Lower/explicit-interface-results-2.f90 index 86aae720e7fcf9..a63ee5fc91794d 100644 --- a/flang/test/Lower/explicit-interface-results-2.f90 +++ b/flang/test/Lower/explicit-interface-results-2.f90 @@ -70,7 +70,7 @@ subroutine host4() call internal_proc_a() contains ! CHECK-LABEL: func private @_QFhost4Pinternal_proc_a -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine internal_proc_a() call takes_array(return_array()) ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 @@ -94,7 +94,7 @@ subroutine host5() implicit none call internal_proc_a() contains -! CHECK-LABEL: func private @_QFhost5Pinternal_proc_a() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-LABEL: func private @_QFhost5Pinternal_proc_a() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine internal_proc_a() call takes_array(return_array()) ! 
CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMsome_moduleEn_module) : !fir.ref diff --git a/flang/test/Lower/host-associated-functions.f90 b/flang/test/Lower/host-associated-functions.f90 index 78d081748c2f42..d67a74fa399804 100644 --- a/flang/test/Lower/host-associated-functions.f90 +++ b/flang/test/Lower/host-associated-functions.f90 @@ -20,7 +20,7 @@ subroutine capture_char_func_dummy(char_func_dummy, n) call internal() contains ! CHECK-LABEL: func private @_QFcapture_char_func_dummyPinternal( - ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>, !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine internal() ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref ()>, i64>, !fir.ref>>, i32) -> !fir.ref ()>, i64>> @@ -56,7 +56,7 @@ subroutine capture_char_func_assumed_dummy(char_func_dummy) call internal() contains ! CHECK-LABEL: func private @_QFcapture_char_func_assumed_dummyPinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine internal() ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref ()>, i64>>>, i32) -> !fir.ref ()>, i64>> @@ -110,7 +110,7 @@ function array_func() contains subroutine internal() ! CHECK-LABEL: func private @_QFcapture_array_funcPinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>, i32) -> !fir.llvm_ptr> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.llvm_ptr> diff --git a/flang/test/Lower/host-associated-globals.f90 b/flang/test/Lower/host-associated-globals.f90 index fe612e777aeaad..c91a5a46af0d5f 100644 --- a/flang/test/Lower/host-associated-globals.f90 +++ b/flang/test/Lower/host-associated-globals.f90 @@ -37,7 +37,7 @@ subroutine bar() print *, j_in_equiv, not_in_equiv end subroutine end subroutine -! CHECK-LABEL: func.func private @_QFtest_commonPbar() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-LABEL: func.func private @_QFtest_commonPbar() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@x_) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index @@ -59,7 +59,7 @@ subroutine bar() print *, j_in_equiv, not_in_equiv end subroutine end subroutine -! CHECK-LABEL: func.func private @_QFsaved_equivPbar() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-LABEL: func.func private @_QFsaved_equivPbar() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFsaved_equivEi) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 4 : index ! 
CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>, index) -> !fir.ref @@ -80,7 +80,7 @@ subroutine bar() end subroutine end subroutine ! CHECK-LABEL: func.func private @_QFmixed_capturePbar( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFmixed_captureEsaved_i) : !fir.ref> ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_1]], %[[VAL_2]] : (!fir.ref>, index) -> !fir.ref diff --git a/flang/test/Lower/host-associated.f90 b/flang/test/Lower/host-associated.f90 index f88903c8af80f8..cdc7e6a05288a7 100644 --- a/flang/test/Lower/host-associated.f90 +++ b/flang/test/Lower/host-associated.f90 @@ -20,7 +20,7 @@ subroutine test1 print *, i contains ! CHECK-LABEL: func private @_QFtest1Ptest1_internal( - ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[iaddr:.*]] = fir.coordinate_of %[[arg]], %c0 ! CHECK: %[[i:.*]] = fir.load %[[iaddr]] : !fir.llvm_ptr> ! CHECK: %[[val:.*]] = fir.call @_QPifoo() {{.*}}: () -> i32 @@ -47,7 +47,7 @@ subroutine test2 print *, a, b contains ! CHECK-LABEL: func private @_QFtest2Ptest2_internal( - ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test2_internal ! CHECK: %[[a:.*]] = fir.coordinate_of %[[arg]], %c0 ! CHECK: %[[aa:.*]] = fir.load %[[a]] : !fir.llvm_ptr> @@ -62,7 +62,7 @@ subroutine test2_internal end subroutine test2_internal ! CHECK-LABEL: func private @_QFtest2Ptest2_inner( - ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test2_inner ! CHECK: %[[a:.*]] = fir.coordinate_of %[[arg]], %c0 ! CHECK: %[[aa:.*]] = fir.load %[[a]] : !fir.llvm_ptr> @@ -96,7 +96,7 @@ subroutine test6(c) contains ! CHECK-LABEL: func private @_QFtest6Ptest6_inner( - ! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test6_inner ! CHECK: %[[coor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>, i32) -> !fir.ref> ! CHECK: %[[load:.*]] = fir.load %[[coor]] : !fir.ref> @@ -138,7 +138,7 @@ subroutine test3(p,q,i) contains ! CHECK-LABEL: func private @_QFtest3Ptest3_inner( - ! CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test3_inner ! CHECK: %[[pcoor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>, !fir.box>>>, i32) -> !fir.ref>> ! 
CHECK: %[[p:.*]] = fir.load %[[pcoor]] : !fir.ref>> @@ -185,7 +185,7 @@ subroutine test3a(p) contains ! CHECK: func private @_QFtest3aPtest3a_inner( - ! CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test3a_inner ! CHECK: %[[pcoor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>, !fir.box>>>, i32) -> !fir.ref>> ! CHECK: %[[p:.*]] = fir.load %[[pcoor]] : !fir.ref>> @@ -229,7 +229,7 @@ subroutine test4 contains ! CHECK-LABEL: func private @_QFtest4Ptest4_inner( - ! CHECK-SAME:%[[tup:.*]]: !fir.ref>>, !fir.ref>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME:%[[tup:.*]]: !fir.ref>>, !fir.ref>>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test4_inner ! CHECK: %[[ptup:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>, !fir.ref>>>>, i32) -> !fir.llvm_ptr>>> ! CHECK: %[[p:.*]] = fir.load %[[ptup]] : !fir.llvm_ptr>>> @@ -271,7 +271,7 @@ subroutine test5 contains ! CHECK-LABEL: func private @_QFtest5Ptest5_inner( - ! CHECK-SAME:%[[tup:.*]]: !fir.ref>>>, !fir.ref>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME:%[[tup:.*]]: !fir.ref>>>, !fir.ref>>>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test5_inner ! CHECK: %[[ptup:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>>, !fir.ref>>>>>, i32) -> !fir.llvm_ptr>>>> ! CHECK: %[[p:.*]] = fir.load %[[ptup]] : !fir.llvm_ptr>>>> @@ -309,7 +309,7 @@ subroutine test7(j, k) contains ! CHECK-LABEL: func private @_QFtest7Ptest7_inner( -! CHECK-SAME: %[[i:.*]]: !fir.ref{{.*}}, %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) -> i32 attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[i:.*]]: !fir.ref{{.*}}, %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) -> i32 attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { elemental integer function test7_inner(i) implicit none integer, intent(in) :: i @@ -330,7 +330,7 @@ subroutine issue990() call bar() contains ! CHECK-LABEL: func private @_QFissue990Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine bar() integer :: stmt_func, i stmt_func(i) = i + captured @@ -352,7 +352,7 @@ subroutine issue990b() call bar() contains ! CHECK-LABEL: func private @_QFissue990bPbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine bar() ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>, i32) -> !fir.llvm_ptr> ! CHECK: %[[addr:.*]] = fir.load %[[tupAddr]] : !fir.llvm_ptr> @@ -373,7 +373,7 @@ real function dummy_proc(x) call bar() contains ! CHECK-LABEL: func private @_QFtest8Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! 
CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine bar() ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref ()>>>, i32) -> !fir.ref ()>> ! CHECK: %[[dummyProc:.*]] = fir.load %[[tupAddr]] : !fir.ref ()>> @@ -393,7 +393,7 @@ subroutine dummy_proc() call bar() contains ! CHECK-LABEL: func private @_QFtest9Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine bar() ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref ()>>>, i32) -> !fir.ref ()>> ! CHECK: %[[dummyProc:.*]] = fir.load %[[tupAddr]] : !fir.ref ()>> @@ -416,7 +416,7 @@ subroutine test10(i) call bar() contains ! CHECK-LABEL: func private @_QFtest10Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[tup:.*]]: !fir.ref>>>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine bar() ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>>>>, i32) -> !fir.llvm_ptr>>>> ! CHECK: fir.load %[[tupAddr]] : !fir.llvm_ptr>>>> @@ -435,7 +435,7 @@ subroutine bar() ! CHECK-LABEL: func private @_QFtest_proc_dummyPtest_proc_dummy_a( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "j"}, -! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_1]], %[[VAL_2]] : (!fir.ref>>, i32) -> !fir.llvm_ptr> ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.llvm_ptr> @@ -528,7 +528,7 @@ end subroutine test_proc_dummy_other ! CHECK-LABEL: func private @_QFtest_proc_dummy_charPgen_message( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, ! CHECK-SAME: %[[VAL_1:.*]]: index, -! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i32 ! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 10 : index ! CHECK-DAG: %[[VAL_5:.*]] = arith.constant false diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index e031b4805dc5b1..70c1f768e389a9 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -520,7 +520,7 @@ subroutine internal end subroutine ! CHECK-LABEL: func.func private @_QMpolymorphic_testFhost_assocPinternal( -! CHECK-SAME: %[[TUPLE:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[TUPLE:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[POS_IN_TUPLE:.*]] = arith.constant 0 : i32 ! CHECK: %[[COORD_OF_CLASS:.*]] = fir.coordinate_of %[[TUPLE]], %[[POS_IN_TUPLE]] : (!fir.ref>>>, i32) -> !fir.ref>> ! 
CHECK: %[[CLASS:.*]] = fir.load %[[COORD_OF_CLASS]] : !fir.ref>>

From b512df660ef136f8bbd0895bf862a827923a6714 Mon Sep 17 00:00:00 2001
From: Vyacheslav Levytskyy
Date: Wed, 17 Apr 2024 11:50:37 +0200
Subject: [PATCH 227/300] [SPIR-V] Improve Tablegen instruction selection and
 account for a pointer size of the target (#88725)

This PR resolves the issue that the SPIR-V backend uses the notion of the
target's pointer size (most notably in legalizer code) while its Tablegen
instruction selection does not account for it. See
https://github.com/llvm/llvm-project/issues/88723 for a detailed description.

Three test cases attached to the PR reproduced the issue when dealing with
spirv32/spirv64 differences and now work correctly with this PR.

---
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp |  3 +-
 llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp      |  6 +-
 llvm/lib/Target/SPIRV/SPIRVInstrInfo.td       | 18 ++++--
 .../Target/SPIRV/SPIRVInstructionSelector.cpp |  8 ++-
 llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp  |  5 +-
 llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp   | 62 ++++++++++++++-----
 .../Target/SPIRV/SPIRVRegisterBankInfo.cpp    | 20 +-----
 llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td   |  7 +--
 llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td    | 49 ++++++++-------
 .../CodeGen/SPIRV/instructions/select-phi.ll  |  4 ++
 .../SPIRV/instructions/select-ptr-load.ll     | 25 ++++++++
 .../test/CodeGen/SPIRV/instructions/select.ll |  3 +
 .../SPIRV/{select.ll => select-builtin.ll}    |  2 +
 13 files changed, 138 insertions(+), 74 deletions(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/instructions/select-ptr-load.ll
 rename llvm/test/CodeGen/SPIRV/{select.ll => select-builtin.ll} (67%)

diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 05e41e06248e35..cebe230d3e8ce3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -653,7 +653,8 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
   auto MRI = MIRBuilder.getMRI();
   assert(MRI->getType(ResVReg).isPointer() && "Pointer type is expected");
   if (Reg != ResVReg) {
-    LLT RegLLTy = LLT::pointer(MRI->getType(ResVReg).getAddressSpace(), 32);
+    LLT RegLLTy =
+        LLT::pointer(MRI->getType(ResVReg).getAddressSpace(), getPointerSize());
     MRI->setType(Reg, RegLLTy);
     assignSPIRVTypeToVReg(BaseType, Reg, MIRBuilder.getMF());
   } else {
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
index aacfecc1e313f0..af98f2f8804593 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -247,8 +247,10 @@ void SPIRVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 bool SPIRVInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   if (MI.getOpcode() == SPIRV::GET_ID || MI.getOpcode() == SPIRV::GET_fID ||
-      MI.getOpcode() == SPIRV::GET_pID || MI.getOpcode() == SPIRV::GET_vfID ||
-      MI.getOpcode() == SPIRV::GET_vID || MI.getOpcode() == SPIRV::GET_vpID) {
+      MI.getOpcode() == SPIRV::GET_pID32 ||
+      MI.getOpcode() == SPIRV::GET_pID64 || MI.getOpcode() == SPIRV::GET_vfID ||
+      MI.getOpcode() == SPIRV::GET_vID || MI.getOpcode() == SPIRV::GET_vpID32 ||
+      MI.getOpcode() == SPIRV::GET_vpID64) {
     auto &MRI = MI.getMF()->getRegInfo();
     MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
     MI.eraseFromParent();
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index a3f981457c8daa..151d0ec1fe5690 100644
---
a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -19,10 +19,12 @@ let isCodeGenOnly=1 in { def DECL_TYPE: Pseudo<(outs ANYID:$dst_id), (ins ANYID:$src_id, TYPE:$src_ty)>; def GET_ID: Pseudo<(outs ID:$dst_id), (ins ANYID:$src)>; def GET_fID: Pseudo<(outs fID:$dst_id), (ins ANYID:$src)>; - def GET_pID: Pseudo<(outs pID:$dst_id), (ins ANYID:$src)>; + def GET_pID32: Pseudo<(outs pID32:$dst_id), (ins ANYID:$src)>; + def GET_pID64: Pseudo<(outs pID64:$dst_id), (ins ANYID:$src)>; def GET_vID: Pseudo<(outs vID:$dst_id), (ins ANYID:$src)>; def GET_vfID: Pseudo<(outs vfID:$dst_id), (ins ANYID:$src)>; - def GET_vpID: Pseudo<(outs vpID:$dst_id), (ins ANYID:$src)>; + def GET_vpID32: Pseudo<(outs vpID32:$dst_id), (ins ANYID:$src)>; + def GET_vpID64: Pseudo<(outs vpID64:$dst_id), (ins ANYID:$src)>; } def SPVTypeBin : SDTypeProfile<1, 2, []>; @@ -66,8 +68,10 @@ multiclass TernOpTypedGen opCode, SDNode node, bit genP = def SIVCond: TernOpTyped; } if genP then { - def SPSCond: TernOpTyped; - def SPVCond: TernOpTyped; + def SPSCond32: TernOpTyped; + def SPVCond32: TernOpTyped; + def SPSCond64: TernOpTyped; + def SPVCond64: TernOpTyped; } if genV then { if genF then { @@ -79,8 +83,10 @@ multiclass TernOpTypedGen opCode, SDNode node, bit genP = def VIVCond: TernOpTyped; } if genP then { - def VPSCond: TernOpTyped; - def VPVCond: TernOpTyped; + def VPSCond32: TernOpTyped; + def VPVCond32: TernOpTyped; + def VPSCond64: TernOpTyped; + def VPVCond64: TernOpTyped; } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 200fe38298c021..72e5a7bcac9834 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -290,14 +290,18 @@ bool SPIRVInstructionSelector::select(MachineInstr &I) { // If it's not a GMIR instruction, we've selected it already. if (!isPreISelGenericOpcode(Opcode)) { if (Opcode == SPIRV::ASSIGN_TYPE) { // These pseudos aren't needed any more. 
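      // A sketch of the case the change below addresses (virtual register
      // names are assumed for illustration): given a type-foldable definition
      // such as
      //   %dst:anyid(p0) = ASSIGN_TYPE %src:anyid(p0), %ptr_ty
      // the pointer-typed destination is retyped to s32 before calling
      // selectImpl(), so the TableGen-generated matcher, which matches on
      // scalar register types, can still select the folded instruction.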
- auto *Def = MRI->getVRegDef(I.getOperand(1).getReg()); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + auto *Def = MRI->getVRegDef(SrcReg); if (isTypeFoldingSupported(Def->getOpcode())) { + if (MRI->getType(DstReg).isPointer()) + MRI->setType(DstReg, LLT::scalar(32)); bool Res = selectImpl(I, *CoverageInfo); assert(Res || Def->getOpcode() == TargetOpcode::G_CONSTANT); if (Res) return Res; } - MRI->replaceRegWith(I.getOperand(1).getReg(), I.getOperand(0).getReg()); + MRI->replaceRegWith(SrcReg, DstReg); I.removeFromParent(); return true; } else if (I.getNumDefs() == 1) { diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp index f069a92ac68683..d652b5de608086 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp @@ -55,8 +55,9 @@ extern void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, static bool isMetaInstrGET(unsigned Opcode) { return Opcode == SPIRV::GET_ID || Opcode == SPIRV::GET_fID || - Opcode == SPIRV::GET_pID || Opcode == SPIRV::GET_vID || - Opcode == SPIRV::GET_vfID || Opcode == SPIRV::GET_vpID; + Opcode == SPIRV::GET_pID32 || Opcode == SPIRV::GET_pID64 || + Opcode == SPIRV::GET_vID || Opcode == SPIRV::GET_vfID || + Opcode == SPIRV::GET_vpID32 || Opcode == SPIRV::GET_vpID64; } static bool mayBeInserted(unsigned Opcode) { diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 2c964595fc39e8..ed03154d4c8dd6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -223,11 +223,12 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR, } static std::pair -createNewIdReg(Register ValReg, unsigned Opcode, MachineRegisterInfo &MRI, +createNewIdReg(SPIRVType *SpvType, Register SrcReg, MachineRegisterInfo &MRI, const SPIRVGlobalRegistry &GR) { - LLT NewT = LLT::scalar(32); - SPIRVType *SpvType = GR.getSPIRVTypeForVReg(ValReg); + if (!SpvType) + SpvType = GR.getSPIRVTypeForVReg(SrcReg); assert(SpvType && "VReg is expected to have SPIRV type"); + LLT NewT = LLT::scalar(32); bool IsFloat = SpvType->getOpcode() == SPIRV::OpTypeFloat; bool IsVectorFloat = SpvType->getOpcode() == SPIRV::OpTypeVector && @@ -236,14 +237,38 @@ createNewIdReg(Register ValReg, unsigned Opcode, MachineRegisterInfo &MRI, IsFloat |= IsVectorFloat; auto GetIdOp = IsFloat ? SPIRV::GET_fID : SPIRV::GET_ID; auto DstClass = IsFloat ? &SPIRV::fIDRegClass : &SPIRV::IDRegClass; - if (MRI.getType(ValReg).isPointer()) { - NewT = LLT::pointer(0, 32); - GetIdOp = SPIRV::GET_pID; - DstClass = &SPIRV::pIDRegClass; - } else if (MRI.getType(ValReg).isVector()) { + if (MRI.getType(SrcReg).isPointer()) { + unsigned PtrSz = GR.getPointerSize(); + NewT = LLT::pointer(0, PtrSz); + bool IsVec = MRI.getType(SrcReg).isVector(); + if (IsVec) + NewT = LLT::fixed_vector(2, NewT); + if (PtrSz == 64) { + if (IsVec) { + GetIdOp = SPIRV::GET_vpID64; + DstClass = &SPIRV::vpID64RegClass; + } else { + GetIdOp = SPIRV::GET_pID64; + DstClass = &SPIRV::pID64RegClass; + } + } else { + if (IsVec) { + GetIdOp = SPIRV::GET_vpID32; + DstClass = &SPIRV::vpID32RegClass; + } else { + GetIdOp = SPIRV::GET_pID32; + DstClass = &SPIRV::pID32RegClass; + } + } + } else if (MRI.getType(SrcReg).isVector()) { NewT = LLT::fixed_vector(2, NewT); - GetIdOp = IsFloat ? SPIRV::GET_vfID : SPIRV::GET_vID; - DstClass = IsFloat ? 
&SPIRV::vfIDRegClass : &SPIRV::vIDRegClass; + if (IsFloat) { + GetIdOp = SPIRV::GET_vfID; + DstClass = &SPIRV::vfIDRegClass; + } else { + GetIdOp = SPIRV::GET_vID; + DstClass = &SPIRV::vIDRegClass; + } } Register IdReg = MRI.createGenericVirtualRegister(NewT); MRI.setRegClass(IdReg, DstClass); @@ -264,6 +289,7 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, MIB.setInsertPt(*Def->getParent(), (Def->getNextNode() ? Def->getNextNode()->getIterator() : Def->getParent()->end())); + SpirvTy = SpirvTy ? SpirvTy : GR->getOrCreateSPIRVType(Ty, MIB); Register NewReg = MRI.createGenericVirtualRegister(MRI.getType(Reg)); if (auto *RC = MRI.getRegClassOrNull(Reg)) { MRI.setRegClass(NewReg, RC); @@ -271,7 +297,6 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, MRI.setRegClass(NewReg, &SPIRV::IDRegClass); MRI.setRegClass(Reg, &SPIRV::IDRegClass); } - SpirvTy = SpirvTy ? SpirvTy : GR->getOrCreateSPIRVType(Ty, MIB); GR->assignSPIRVTypeToVReg(SpirvTy, Reg, MIB.getMF()); // This is to make it convenient for Legalizer to get the SPIRVType // when processing the actual MI (i.e. not pseudo one). @@ -290,11 +315,11 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, MachineRegisterInfo &MRI, SPIRVGlobalRegistry *GR) { - unsigned Opc = MI.getOpcode(); assert(MI.getNumDefs() > 0 && MRI.hasOneUse(MI.getOperand(0).getReg())); MachineInstr &AssignTypeInst = *(MRI.use_instr_begin(MI.getOperand(0).getReg())); - auto NewReg = createNewIdReg(MI.getOperand(0).getReg(), Opc, MRI, *GR).first; + auto NewReg = + createNewIdReg(nullptr, MI.getOperand(0).getReg(), MRI, *GR).first; AssignTypeInst.getOperand(1).setReg(NewReg); MI.getOperand(0).setReg(NewReg); MIB.setInsertPt(*MI.getParent(), @@ -303,7 +328,7 @@ void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, for (auto &Op : MI.operands()) { if (!Op.isReg() || Op.isDef()) continue; - auto IdOpInfo = createNewIdReg(Op.getReg(), Opc, MRI, *GR); + auto IdOpInfo = createNewIdReg(nullptr, Op.getReg(), MRI, *GR); MIB.buildInstr(IdOpInfo.second).addDef(IdOpInfo.first).addUse(Op.getReg()); Op.setReg(IdOpInfo.first); } @@ -419,6 +444,7 @@ static void processInstrsWithTypeFolding(MachineFunction &MF, processInstr(MI, MIB, MRI, GR); } } + for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { // We need to rewrite dst types for ASSIGN_TYPE instrs to be able @@ -431,16 +457,18 @@ static void processInstrsWithTypeFolding(MachineFunction &MF, if (!isTypeFoldingSupported(Opcode)) continue; Register DstReg = MI.getOperand(0).getReg(); - if (MRI.getType(DstReg).isVector()) + bool IsDstPtr = MRI.getType(DstReg).isPointer(); + if (IsDstPtr || MRI.getType(DstReg).isVector()) MRI.setRegClass(DstReg, &SPIRV::IDRegClass); // Don't need to reset type of register holding constant and used in - // G_ADDRSPACE_CAST, since it braaks legalizer. + // G_ADDRSPACE_CAST, since it breaks legalizer. if (Opcode == TargetOpcode::G_CONSTANT && MRI.hasOneUse(DstReg)) { MachineInstr &UseMI = *MRI.use_instr_begin(DstReg); if (UseMI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST) continue; } - MRI.setType(DstReg, LLT::scalar(32)); + MRI.setType(DstReg, IsDstPtr ? 
LLT::pointer(0, GR->getPointerSize()) + : LLT::scalar(32)); } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp index 5983c9229cb3c2..ecd99f1840d7e0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp @@ -27,23 +27,7 @@ using namespace llvm; const RegisterBank & SPIRVRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const { - switch (RC.getID()) { - case SPIRV::TYPERegClassID: + if (RC.getID() == SPIRV::TYPERegClassID) return SPIRV::TYPERegBank; - case SPIRV::pIDRegClassID: - case SPIRV::IDRegClassID: - return SPIRV::IDRegBank; - case SPIRV::fIDRegClassID: - return SPIRV::fIDRegBank; - case SPIRV::vIDRegClassID: - return SPIRV::vIDRegBank; - case SPIRV::vfIDRegClassID: - return SPIRV::vfIDRegBank; - case SPIRV::vpIDRegClassID: - return SPIRV::vpIDRegBank; - case SPIRV::ANYIDRegClassID: - case SPIRV::ANYRegClassID: - return SPIRV::IDRegBank; - } - llvm_unreachable("Unknown register class"); + return SPIRV::IDRegBank; } diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td b/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td index c7f1e172f3d4f1..dea2ef402d3d97 100644 --- a/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td +++ b/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td @@ -8,9 +8,6 @@ // Although RegisterBankSelection is disabled we need to distinct the banks // as InstructionSelector RegClass checking code relies on them -def IDRegBank : RegisterBank<"IDBank", [ID]>; -def fIDRegBank : RegisterBank<"fIDBank", [fID]>; -def vIDRegBank : RegisterBank<"vIDBank", [vID]>; -def vfIDRegBank : RegisterBank<"vfIDBank", [vfID]>; -def vpIDRegBank : RegisterBank<"vpIDBank", [vpID]>; + def TYPERegBank : RegisterBank<"TYPEBank", [TYPE]>; +def IDRegBank : RegisterBank<"IDBank", [ID, fID, pID32, pID64, vID, vfID, vpID32, vpID64]>; diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td index 6d2bfb91a97f12..9231d22e8d8362 100644 --- a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td @@ -11,39 +11,46 @@ //===----------------------------------------------------------------------===// let Namespace = "SPIRV" in { - def p0 : PtrValueType ; - - class P0Vec - : PtrValueType { - let nElem = 2; - let ElementType = p0; - let isInteger = false; - let isFP = false; - let isVector = true; + // Pointer types for patterns with the GlobalISelEmitter + def p32 : PtrValueType ; + def p64 : PtrValueType ; + + class VTPtrVec + : VTVec, ptr.Value> { + int isPointer = true; } - def v2p0 : P0Vec; - // All registers are for 32-bit identifiers, so have a single dummy register + def v2p32 : VTPtrVec<2, p32>; + def v2p64 : VTPtrVec<2, p64>; - // Class for registers that are the result of OpTypeXXX instructions + // Class for type registers def TYPE0 : Register<"TYPE0">; def TYPE : RegisterClass<"SPIRV", [i32], 32, (add TYPE0)>; - // Class for every other non-type ID + // Class for non-type registers def ID0 : Register<"ID0">; - def ID : RegisterClass<"SPIRV", [i32], 32, (add ID0)>; def fID0 : Register<"fID0">; - def fID : RegisterClass<"SPIRV", [f32], 32, (add fID0)>; - def pID0 : Register<"pID0">; - def pID : RegisterClass<"SPIRV", [p0], 32, (add pID0)>; + def pID320 : Register<"pID320">; + def pID640 : Register<"pID640">; def vID0 : Register<"vID0">; - def vID : RegisterClass<"SPIRV", [v2i32], 32, (add vID0)>; def vfID0 : Register<"vfID0">; + def vpID320 : Register<"vpID320">; + def vpID640 : 
Register<"vpID640">; + + def ID : RegisterClass<"SPIRV", [i32], 32, (add ID0)>; + def fID : RegisterClass<"SPIRV", [f32], 32, (add fID0)>; + def pID32 : RegisterClass<"SPIRV", [p32], 32, (add pID320)>; + def pID64 : RegisterClass<"SPIRV", [p64], 32, (add pID640)>; + def vID : RegisterClass<"SPIRV", [v2i32], 32, (add vID0)>; def vfID : RegisterClass<"SPIRV", [v2f32], 32, (add vfID0)>; - def vpID0 : Register<"vpID0">; - def vpID : RegisterClass<"SPIRV", [v2p0], 32, (add vpID0)>; + def vpID32 : RegisterClass<"SPIRV", [v2p32], 32, (add vpID320)>; + def vpID64 : RegisterClass<"SPIRV", [v2p64], 32, (add vpID640)>; - def ANYID : RegisterClass<"SPIRV", [i32, f32, p0, v2i32, v2f32], 32, (add ID, fID, pID, vID, vfID)>; + def ANYID : RegisterClass< + "SPIRV", + [i32, f32, p32, p64, v2i32, v2f32, v2p32, v2p64], + 32, + (add ID0, fID0, pID320, pID640, vID0, vfID0, vpID320, vpID640)>; // A few instructions like OpName can take ids from both type and non-type // instructions, so we need a super-class to allow for both to count as valid diff --git a/llvm/test/CodeGen/SPIRV/instructions/select-phi.ll b/llvm/test/CodeGen/SPIRV/instructions/select-phi.ll index afc75c616f023b..3828fe89e60aec 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/select-phi.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/select-phi.ll @@ -1,6 +1,10 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s + ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --translator-compatibility-mode %s -o - -filetype=obj | spirv-val %} ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --translator-compatibility-mode %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[Char:.*]] = OpTypeInt 8 0 ; CHECK-DAG: %[[Long:.*]] = OpTypeInt 32 0 diff --git a/llvm/test/CodeGen/SPIRV/instructions/select-ptr-load.ll b/llvm/test/CodeGen/SPIRV/instructions/select-ptr-load.ll new file mode 100644 index 00000000000000..0ff28952f8081a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/instructions/select-ptr-load.ll @@ -0,0 +1,25 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-DAG: %[[Float:.*]] = OpTypeFloat 32 +; CHECK-SPIRV-DAG: %[[FloatPtr:.*]] = OpTypePointer Function %[[Float]] +; CHECK-SPIRV: OpInBoundsPtrAccessChain %[[FloatPtr]] +; CHECK-SPIRV: OpInBoundsPtrAccessChain %[[FloatPtr]] +; CHECK-SPIRV: OpSelect %[[FloatPtr]] +; CHECK-SPIRV: OpLoad %[[Float]] + +%struct = type { [3 x float] } + +define spir_kernel void @bar(i1 %sw) { +entry: + %var1 = alloca %struct + %var2 = alloca %struct + %elem1 = getelementptr inbounds [3 x float], ptr %var1, i64 0, i64 0 + %elem2 = getelementptr inbounds [3 x float], ptr %var2, i64 0, i64 1 + %elem = select i1 %sw, ptr %elem1, ptr %elem2 + %res = load float, ptr %elem + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/instructions/select.ll b/llvm/test/CodeGen/SPIRV/instructions/select.ll index c4176b17abb449..9234b97157d9d8 
100644
--- a/llvm/test/CodeGen/SPIRV/instructions/select.ll
+++ b/llvm/test/CodeGen/SPIRV/instructions/select.ll
@@ -1,6 +1,9 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
 ; CHECK-DAG: OpName [[SCALARi32:%.+]] "select_i32"
 ; CHECK-DAG: OpName [[SCALARPTR:%.+]] "select_ptr"
 ; CHECK-DAG: OpName [[VEC2i32:%.+]] "select_i32v2"
diff --git a/llvm/test/CodeGen/SPIRV/select.ll b/llvm/test/CodeGen/SPIRV/select-builtin.ll
similarity index 67%
rename from llvm/test/CodeGen/SPIRV/select.ll
rename to llvm/test/CodeGen/SPIRV/select-builtin.ll
index b34e91be1dbcda..6717970d160fcf 100644
--- a/llvm/test/CodeGen/SPIRV/select.ll
+++ b/llvm/test/CodeGen/SPIRV/select-builtin.ll
@@ -1,4 +1,6 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: OpSelect

From 42d801d4e42ff8c47c3a24d562774851e3a424f5 Mon Sep 17 00:00:00 2001
From: Vyacheslav Levytskyy
Date: Wed, 17 Apr 2024 11:50:55 +0200
Subject: [PATCH 228/300] [SPIR-V] Account for zext in an LLVM intrinsic call
 (#88903)

This PR addresses an issue that may arise when an integer argument's size
differs from the target's machine word size in a call to an LLVM intrinsic.
The following example demonstrates the issue:

```
@__const.test.arr = private unnamed_addr addrspace(2) constant [3 x i32] [i32 1, i32 2, i32 3]

define spir_func void @test() {
entry:
  %arr = alloca [3 x i32], align 4
  %dest = bitcast ptr %arr to ptr
  call void @llvm.memcpy.p0.p2.i32(ptr align 4 %dest, ptr addrspace(2) align 4 @__const.test.arr, i32 1024, i1 false)
  ret void
}

declare void @llvm.memcpy.p0.p2.i32(ptr nocapture writeonly, ptr addrspace(2) nocapture readonly, i32, i1)
```

Depending on the target, this code may work or fail without this PR, because
the IR translation step introduces an additional `zext` when the type of the
3rd argument of `@llvm.memcpy.p0.p2.i32` differs from the machine word. This
PR addresses the issue by adding type deduction for the newly inserted G_ZEXT
generic opcode.

---
 llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp   | 25 +++++++++
 .../CodeGen/SPIRV/transcoding/memcpy-zext.ll  | 43 ++++++++++++++++
 .../spirv-private-array-initialization.ll     | 51 +++++++++++--------
 3 files changed, 99 insertions(+), 20 deletions(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/transcoding/memcpy-zext.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index ed03154d4c8dd6..d16f6d5bf67ef4 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -171,6 +171,12 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,
 // %1 = G_GLOBAL_VALUE
 // %2 = COPY %1
 // %3 = G_ADDRSPACE_CAST %2
+//
+// or
+//
+// %1 = G_ZEXT %2
+// G_MEMCPY ... %2 ...
+//
 // New registers have no SPIRVType and no register class info.
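// For example (a sketch assuming a spirv64 target, where the i32 size
// argument of the memcpy above is widened to the 64-bit machine word):
//   %2:_(s32) = G_CONSTANT i32 1024   ; source already known as OpTypeInt 32
//   %1:_(s64) = G_ZEXT %2             ; deduced as OpTypeInt 64
// i.e. an integer type of max(destination scalar width, known source width)
// bits is created, per component for vector operands.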
// // Set SPIRVType for GV, propagate it from GV to other instructions, @@ -200,6 +206,24 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR, SpirvTy = GR->getOrCreateSPIRVType(Ty, MIB); break; } + case TargetOpcode::G_ZEXT: { + if (MI->getOperand(1).isReg()) { + if (MachineInstr *DefInstr = + MRI.getVRegDef(MI->getOperand(1).getReg())) { + if (SPIRVType *Def = propagateSPIRVType(DefInstr, GR, MRI, MIB)) { + unsigned CurrentBW = GR->getScalarOrVectorBitWidth(Def); + unsigned ExpectedBW = + std::max(MRI.getType(Reg).getScalarSizeInBits(), CurrentBW); + unsigned NumElements = GR->getScalarOrVectorComponentCount(Def); + SpirvTy = GR->getOrCreateSPIRVIntegerType(ExpectedBW, MIB); + if (NumElements > 1) + SpirvTy = + GR->getOrCreateSPIRVVectorType(SpirvTy, NumElements, MIB); + } + } + } + break; + } case TargetOpcode::G_TRUNC: case TargetOpcode::G_ADDRSPACE_CAST: case TargetOpcode::G_PTR_ADD: @@ -415,6 +439,7 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, } insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI); } else if (MI.getOpcode() == TargetOpcode::G_TRUNC || + MI.getOpcode() == TargetOpcode::G_ZEXT || MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE || MI.getOpcode() == TargetOpcode::COPY || MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST) { diff --git a/llvm/test/CodeGen/SPIRV/transcoding/memcpy-zext.ll b/llvm/test/CodeGen/SPIRV/transcoding/memcpy-zext.ll new file mode 100644 index 00000000000000..ea0197548a8154 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/memcpy-zext.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-32 +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-64 +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-64-DAG: %[[#i64:]] = OpTypeInt 64 0 + +; CHECK-DAG: %[[#i8:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#i32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#one:]] = OpConstant %[[#i32]] 1 +; CHECK-DAG: %[[#two:]] = OpConstant %[[#i32]] 2 +; CHECK-DAG: %[[#three:]] = OpConstant %[[#i32]] 3 +; CHECK-DAG: %[[#i32x3:]] = OpTypeArray %[[#i32]] %[[#three]] +; CHECK-DAG: %[[#test_arr_init:]] = OpConstantComposite %[[#i32x3]] %[[#one]] %[[#two]] %[[#three]] +; CHECK-DAG: %[[#szconst1024:]] = OpConstant %[[#i32]] 1024 +; CHECK-DAG: %[[#szconst42:]] = OpConstant %[[#i8]] 42 +; CHECK-DAG: %[[#const_i32x3_ptr:]] = OpTypePointer UniformConstant %[[#i32x3]] +; CHECK-DAG: %[[#test_arr:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] +; CHECK-DAG: %[[#i32x3_ptr:]] = OpTypePointer Function %[[#i32x3]] +; CHECK: %[[#arr:]] = OpVariable %[[#i32x3_ptr]] Function + +; CHECK-32: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#szconst1024]] +; CHECK-64: %[[#szconstext1024:]] = OpUConvert %[[#i64:]] %[[#szconst1024:]] +; CHECK-64: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#szconstext1024]] + +; CHECK-32: %[[#szconstext42:]] = OpUConvert %[[#i32:]] %[[#szconst42:]] +; CHECK-32: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#szconstext42]] +; CHECK-64: %[[#szconstext42:]] = OpUConvert %[[#i64:]] %[[#szconst42:]] +; CHECK-64: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#szconstext42]] + +@__const.test.arr = private unnamed_addr addrspace(2) constant [3 x i32] [i32 1, i32 2, i32 3] + +define spir_func void @test() { +entry: + 
%arr = alloca [3 x i32], align 4 + %dest = bitcast ptr %arr to ptr + call void @llvm.memcpy.p0.p2.i32(ptr align 4 %dest, ptr addrspace(2) align 4 @__const.test.arr, i32 1024, i1 false) + call void @llvm.memcpy.p0.p2.i8(ptr align 4 %dest, ptr addrspace(2) align 4 @__const.test.arr, i8 42, i1 false) + ret void +} + +declare void @llvm.memcpy.p0.p2.i32(ptr nocapture writeonly, ptr addrspace(2) nocapture readonly, i32, i1) +declare void @llvm.memcpy.p0.p2.i8(ptr nocapture writeonly, ptr addrspace(2) nocapture readonly, i8, i1) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll index e0172ec3c1bdb7..04fb39118034c8 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll @@ -1,23 +1,34 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV -; -; CHECK-SPIRV-DAG: %[[#i32:]] = OpTypeInt 32 0 -; CHECK-SPIRV-DAG: %[[#one:]] = OpConstant %[[#i32]] 1 -; CHECK-SPIRV-DAG: %[[#two:]] = OpConstant %[[#i32]] 2 -; CHECK-SPIRV-DAG: %[[#three:]] = OpConstant %[[#i32]] 3 -; CHECK-SPIRV-DAG: %[[#i32x3:]] = OpTypeArray %[[#i32]] %[[#three]] -; CHECK-SPIRV-DAG: %[[#test_arr_init:]] = OpConstantComposite %[[#i32x3]] %[[#one]] %[[#two]] %[[#three]] -; CHECK-SPIRV-DAG: %[[#twelve:]] = OpConstant %[[#i32]] 12 -; CHECK-SPIRV-DAG: %[[#const_i32x3_ptr:]] = OpTypePointer UniformConstant %[[#i32x3]] - -; CHECK-SPIRV: %[[#test_arr2:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] -; CHECK-SPIRV: %[[#test_arr:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] - -; CHECK-SPIRV-DAG: %[[#i32x3_ptr:]] = OpTypePointer Function %[[#i32x3]] - -; CHECK-SPIRV: %[[#arr:]] = OpVariable %[[#i32x3_ptr]] Function -; CHECK-SPIRV: %[[#arr2:]] = OpVariable %[[#i32x3_ptr]] Function -; CHECK-SPIRV: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelve]] Aligned 4 -; CHECK-SPIRV: OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelve]] Aligned 4 +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-32 +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-64 +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-64-DAG: %[[#i64:]] = OpTypeInt 64 0 + +; CHECK-SPIRV-DAG: %[[#i32:]] = OpTypeInt 32 0 +; CHECK-SPIRV-DAG: %[[#one:]] = OpConstant %[[#i32]] 1 +; CHECK-SPIRV-DAG: %[[#two:]] = OpConstant %[[#i32]] 2 +; CHECK-SPIRV-DAG: %[[#three:]] = OpConstant %[[#i32]] 3 +; CHECK-SPIRV-DAG: %[[#i32x3:]] = OpTypeArray %[[#i32]] %[[#three]] +; CHECK-SPIRV-DAG: %[[#test_arr_init:]] = OpConstantComposite %[[#i32x3]] %[[#one]] %[[#two]] %[[#three]] +; CHECK-SPIRV-DAG: %[[#twelve:]] = OpConstant %[[#i32]] 12 +; CHECK-SPIRV-DAG: %[[#const_i32x3_ptr:]] = OpTypePointer UniformConstant %[[#i32x3]] + +; CHECK-SPIRV: %[[#test_arr2:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] +; CHECK-SPIRV: %[[#test_arr:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] + +; CHECK-SPIRV-DAG: %[[#i32x3_ptr:]] = OpTypePointer Function %[[#i32x3]] + +; CHECK-SPIRV: %[[#arr:]] = OpVariable %[[#i32x3_ptr]] Function +; CHECK-SPIRV: 
%[[#arr2:]] = OpVariable %[[#i32x3_ptr]] Function
+
+; CHECK-SPIRV-32: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelve]] Aligned 4
+; CHECK-SPIRV-32: OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelve]] Aligned 4
+
+; CHECK-SPIRV-64: %[[#twelvezext1:]] = OpUConvert %[[#i64:]] %[[#twelve:]]
+; CHECK-SPIRV-64: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelvezext1]] Aligned 4
+; CHECK-SPIRV-64: %[[#twelvezext2:]] = OpUConvert %[[#i64:]] %[[#twelve:]]
+; CHECK-SPIRV-64: OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelvezext2]] Aligned 4
 
 @__const.test.arr = private unnamed_addr addrspace(2) constant [3 x i32] [i32 1, i32 2, i32 3], align 4

From fa61f062a515be92a98cac64a9193498918c1225 Mon Sep 17 00:00:00 2001
From: harishch4
Date: Wed, 17 Apr 2024 15:21:45 +0530
Subject: [PATCH 229/300] Fix threadprivate variable scope inside BLOCK
 construct. (#88921)

When a local variable inside a BLOCK construct is used as a threadprivate
variable, llvm-flang reports the error below:

> error: The THREADPRIVATE directive and the common block or variable in it must appear in the same declaration section of a scoping unit

---
 flang/lib/Semantics/check-omp-structure.cpp     |  2 +-
 flang/test/Lower/OpenMP/threadprivate-hlfir.f90 |  1 +
 flang/test/Semantics/OpenMP/threadprivate07.f90 | 15 +++++++++++++++
 3 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Semantics/OpenMP/threadprivate07.f90

diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index e85d8d1f7ab533..bafa242a79302a 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1048,7 +1048,7 @@ void OmpStructureChecker::CheckThreadprivateOrDeclareTargetVar(
           name->symbol->GetUltimate().owner();
       if (!curScope.IsTopLevel()) {
         const semantics::Scope &declScope =
-            GetProgramUnitContaining(curScope);
+            GetProgramUnitOrBlockConstructContaining(curScope);
         const semantics::Symbol *sym{
             declScope.parent().FindSymbol(name->symbol->name())};
         if (sym &&
diff --git a/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 b/flang/test/Lower/OpenMP/threadprivate-hlfir.f90
index d39ae1e7011838..7d02987c5eadee 100644
--- a/flang/test/Lower/OpenMP/threadprivate-hlfir.f90
+++ b/flang/test/Lower/OpenMP/threadprivate-hlfir.f90
@@ -24,3 +24,4 @@ subroutine sub()
     print *, a
   !$omp end parallel
 end subroutine
+
diff --git a/flang/test/Semantics/OpenMP/threadprivate07.f90 b/flang/test/Semantics/OpenMP/threadprivate07.f90
new file mode 100644
index 00000000000000..c9a006ca0e0839
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/threadprivate07.f90
@@ -0,0 +1,15 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
+
+! Check Threadprivate Directive with local variable of a BLOCK construct.
+
+program main
+  call sub1()
+  print *, 'pass'
+end program main
+
+subroutine sub1()
+  BLOCK
+    integer, save :: a
+    !$omp threadprivate(a)
+  END BLOCK
+end subroutine

From cbe148b730a04fc95eda9a43903f0af487884a96 Mon Sep 17 00:00:00 2001
From: Mel Chen
Date: Wed, 17 Apr 2024 17:59:52 +0800
Subject: [PATCH 230/300] [LV][NFC] Remove the declaration of function
 `fixReduction`.
 (#88491)

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 44885a95bd1020..20059f9d62d552 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -616,9 +616,6 @@ class InnerLoopVectorizer {
   void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                                VPTransformState &State);
 
-  /// Create code for the loop exit value of the reduction.
-  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
-
   /// Iteratively sink the scalarized operands of a predicated instruction into
   /// the block that was created for it.
   void sinkScalarOperands(Instruction *PredInst);

From a9bafe91dd088c5fa6f074c14dd3a1af25f00457 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 17 Apr 2024 11:00:58 +0100
Subject: [PATCH 231/300] [VPlan] Split VPWidenMemoryInstructionRecipe (NFCI).
 (#87411)

This patch introduces a new VPWidenMemoryRecipe base class and distinct
sub-classes to model loads and stores. This is a first step in an effort to
simplify and modularize code generation for widened loads and stores, and to
enable adding further, more specialized memory recipes.

PR: https://github.com/llvm/llvm-project/pull/87411

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 191 +++++++++---------
 .../Transforms/Vectorize/VPRecipeBuilder.h    |   6 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         | 161 +++++++++------
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |   9 +-
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.h |   4 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  44 ++--
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  23 +--
 llvm/lib/Transforms/Vectorize/VPlanValue.h    |   5 +-
 .../Transforms/Vectorize/VPlanVerifier.cpp    |   2 +-
 .../AArch64/vector-reverse-mask4.ll           |   2 +-
 .../LoopVectorize/X86/masked_load_store.ll    |  12 +-
 .../Transforms/Vectorize/VPlanHCFGTest.cpp    |   4 +-
 .../Transforms/Vectorize/VPlanTest.cpp        |   9 +-
 13 files changed, 249 insertions(+), 223 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 20059f9d62d552..a8272f45025358 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -545,11 +545,6 @@ class InnerLoopVectorizer {
   // Return true if any runtime check is added.
   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
 
-  /// A type for vectorized values in the new loop. Each value from the
-  /// original loop, when vectorized, is represented by UF vector values in the
-  /// new unrolled loop, where UF is the unroll factor.
-  using VectorParts = SmallVector;
-
   /// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, @@ -8086,7 +8081,7 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { BlockMaskCache[BB] = BlockMask; } -VPWidenMemoryInstructionRecipe * +VPWidenMemoryRecipe * VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VFRange &Range) { assert((isa(I) || isa(I)) && @@ -8131,12 +8126,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, Ptr = VectorPtr; } if (LoadInst *Load = dyn_cast(I)) - return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive, - Reverse, I->getDebugLoc()); + return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, + I->getDebugLoc()); StoreInst *Store = cast(I); - return new VPWidenMemoryInstructionRecipe( - *Store, Ptr, Operands[0], Mask, Consecutive, Reverse, I->getDebugLoc()); + return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, + Reverse, I->getDebugLoc()); } /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also @@ -8775,13 +8770,12 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // for this VPlan, replace the Recipes widening its memory instructions with a // single VPInterleaveRecipe at its insertion point. for (const auto *IG : InterleaveGroups) { - auto *Recipe = cast( - RecipeBuilder.getRecipe(IG->getInsertPos())); + auto *Recipe = + cast(RecipeBuilder.getRecipe(IG->getInsertPos())); SmallVector StoredValues; for (unsigned i = 0; i < IG->getFactor(); ++i) if (auto *SI = dyn_cast_or_null(IG->getMember(i))) { - auto *StoreR = - cast(RecipeBuilder.getRecipe(SI)); + auto *StoreR = cast(RecipeBuilder.getRecipe(SI)); StoredValues.push_back(StoreR->getStoredValue()); } @@ -9368,92 +9362,27 @@ static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, return Call; } -void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; - - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast(&Ingredient); - StoreInst *SI = dyn_cast(&Ingredient); - - assert((LI || SI) && "Invalid Load/Store instruction"); - assert((!SI || StoredValue) && "No stored value provided for widened store"); - assert((!LI || !StoredValue) && "Stored value provided for widened load"); +void VPWidenLoadRecipe::execute(VPTransformState &State) { + auto *LI = cast(&Ingredient); Type *ScalarDataTy = getLoadStoreType(&Ingredient); - auto *DataTy = VectorType::get(ScalarDataTy, State.VF); const Align Alignment = getLoadStoreAlignment(&Ingredient); - bool CreateGatherScatter = !isConsecutive(); + bool CreateGather = !isConsecutive(); auto &Builder = State.Builder; - InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); - bool isMaskRequired = getMask(); - if (isMaskRequired) { - // Mask reversal is only needed for non-all-one (null) masks, as reverse of - // a null all-one mask is a null mask. - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Mask = State.get(getMask(), Part); + State.setDebugLocFrom(getDebugLoc()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *NewLI; + Value *Mask = nullptr; + if (auto *VPMask = getMask()) { + // Mask reversal is only needed for non-all-one (null) masks, as reverse + // of a null all-one mask is a null mask. 
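      // A sketch of the reverse case handled here (loop and names assumed):
      //   for (i = n - 1; i >= 0; --i) if (c[i]) s += a[i];
      // the consecutive load of a[i] runs backwards, so the mask (and later
      // the loaded vector) is reversed lane-wise to keep the two aligned.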
+ Mask = State.get(VPMask, Part); if (isReverse()) Mask = Builder.CreateVectorReverse(Mask, "reverse"); - BlockInMaskParts[Part] = Mask; - } - } - - // Handle Stores: - if (SI) { - State.setDebugLocFrom(getDebugLoc()); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Instruction *NewSI = nullptr; - Value *StoredVal = State.get(StoredValue, Part); - // TODO: split this into several classes for better design. - if (State.EVL) { - assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " - "explicit vector length."); - assert(cast(State.EVL)->getOpcode() == - VPInstruction::ExplicitVectorLength && - "EVL must be VPInstruction::ExplicitVectorLength."); - Value *EVL = State.get(State.EVL, VPIteration(0, 0)); - // If EVL is not nullptr, then EVL must be a valid value set during plan - // creation, possibly default value = whole vector register length. EVL - // is created only if TTI prefers predicated vectorization, thus if EVL - // is not nullptr it also implies preference for predicated - // vectorization. - // FIXME: Support reverse store after vp_reverse is added. - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - NewSI = lowerStoreUsingVectorIntrinsics( - Builder, State.get(getAddr(), Part, !CreateGatherScatter), - StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment); - } else if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(getAddr(), Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); - } else { - if (isReverse()) { - // If we store to reverse consecutive memory locations, then we need - // to reverse the order of elements in the stored value. - StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); - // We don't want to update the value in the map as it might be used in - // another expression. So don't call resetVectorValue(StoredVal). - } - auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true); - if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - BlockInMaskParts[Part]); - else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); - } - State.addMetadata(NewSI, SI); } - return; - } - // Handle loads. - assert(LI && "Must have a load instruction"); - State.setDebugLocFrom(getDebugLoc()); - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *NewLI; // TODO: split this into several classes for better design. if (State.EVL) { assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " @@ -9468,22 +9397,20 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { // is not nullptr it also implies preference for predicated // vectorization. // FIXME: Support reverse loading after vp_reverse is added. - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; NewLI = lowerLoadUsingVectorIntrinsics( - Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter), - CreateGatherScatter, MaskPart, EVL, Alignment); - } else if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; + Builder, DataTy, State.get(getAddr(), Part, !CreateGather), + CreateGather, Mask, EVL, Alignment); + } else if (CreateGather) { Value *VectorGep = State.get(getAddr(), Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, Mask, nullptr, "wide.masked.gather"); State.addMetadata(NewLI, LI); } else { auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true); - if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad( - DataTy, VecPtr, Alignment, BlockInMaskParts[Part], - PoisonValue::get(DataTy), "wide.masked.load"); + if (Mask) + NewLI = Builder.CreateMaskedLoad(DataTy, VecPtr, Alignment, Mask, + PoisonValue::get(DataTy), + "wide.masked.load"); else NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); @@ -9494,7 +9421,69 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); } - State.set(getVPSingleValue(), NewLI, Part); + State.set(this, NewLI, Part); + } +} + +void VPWidenStoreRecipe::execute(VPTransformState &State) { + auto *SI = cast(&Ingredient); + + VPValue *StoredVPValue = getStoredValue(); + bool CreateScatter = !isConsecutive(); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Instruction *NewSI = nullptr; + Value *Mask = nullptr; + if (auto *VPMask = getMask()) { + // Mask reversal is only needed for non-all-one (null) masks, as reverse + // of a null all-one mask is a null mask. + Mask = State.get(VPMask, Part); + if (isReverse()) + Mask = Builder.CreateVectorReverse(Mask, "reverse"); + } + + Value *StoredVal = State.get(StoredVPValue, Part); + if (isReverse()) { + assert(!State.EVL && "reversing not yet implemented with EVL"); + // If we store to reverse consecutive memory locations, then we need + // to reverse the order of elements in the stored value. + StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); + // We don't want to update the value in the map as it might be used in + // another expression. So don't call resetVectorValue(StoredVal). + } + // TODO: split this into several classes for better design. + if (State.EVL) { + assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " + "explicit vector length."); + assert(cast(State.EVL)->getOpcode() == + VPInstruction::ExplicitVectorLength && + "EVL must be VPInstruction::ExplicitVectorLength."); + Value *EVL = State.get(State.EVL, VPIteration(0, 0)); + // If EVL is not nullptr, then EVL must be a valid value set during plan + // creation, possibly default value = whole vector register length. EVL + // is created only if TTI prefers predicated vectorization, thus if EVL + // is not nullptr it also implies preference for predicated + // vectorization. + // FIXME: Support reverse store after vp_reverse is added. 
+ NewSI = lowerStoreUsingVectorIntrinsics( + Builder, State.get(getAddr(), Part, !CreateScatter), StoredVal, + CreateScatter, Mask, EVL, Alignment); + } else if (CreateScatter) { + Value *VectorGep = State.get(getAddr(), Part); + NewSI = + Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, Mask); + } else { + auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true); + if (Mask) + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask); + else + NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } + State.addMetadata(NewSI, SI); } } diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 605b47fa0a46b8..b4c7ab02f928f0 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -69,9 +69,9 @@ class VPRecipeBuilder { /// Check if the load or store instruction \p I should widened for \p /// Range.Start and potentially masked. Such instructions are handled by a /// recipe that takes an additional VPInstruction for the mask. - VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I, - ArrayRef Operands, - VFRange &Range); + VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I, + ArrayRef Operands, + VFRange &Range); /// Check if an induction recipe should be constructed for \p Phi. If so build /// and return it. If not, return null. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index d86a81d4fb4c73..148227f1f1a57b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -875,7 +875,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { return true; case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPBranchOnMaskSC: - case VPRecipeBase::VPWidenMemoryInstructionSC: + case VPRecipeBase::VPWidenLoadSC: + case VPRecipeBase::VPWidenStoreSC: // TODO: Widened stores don't define a value, but widened loads do. Split // the recipes to be able to make widened loads VPSingleDefRecipes. return false; @@ -2280,68 +2281,62 @@ class VPPredInstPHIRecipe : public VPSingleDefRecipe { } }; -/// A Recipe for widening load/store operations. -/// The recipe uses the following VPValues: -/// - For load: Address, optional mask -/// - For store: Address, stored value, optional mask -/// TODO: We currently execute only per-part unless a specific instance is -/// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase { +/// A common base class for widening memory operations. An optional mask can be +/// provided as the last operand. +class VPWidenMemoryRecipe : public VPRecipeBase { +protected: Instruction &Ingredient; - // Whether the loaded-from / stored-to addresses are consecutive. + /// Whether the accessed addresses are consecutive. bool Consecutive; - // Whether the consecutive loaded/stored addresses are in reverse order. + /// Whether the consecutive accessed addresses are in reverse order. bool Reverse; + /// Whether the memory access is masked. + bool IsMasked = false; + void setMask(VPValue *Mask) { + assert(!IsMasked && "cannot re-set mask"); if (!Mask) return; addOperand(Mask); + IsMasked = true; } - bool isMasked() const { - return isStore() ? 
getNumOperands() == 3 : getNumOperands() == 2; + VPWidenMemoryRecipe(const char unsigned SC, Instruction &I, + std::initializer_list Operands, + bool Consecutive, bool Reverse, DebugLoc DL) + : VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive), + Reverse(Reverse) { + assert((Consecutive || !Reverse) && "Reverse implies consecutive"); } public: - VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, - bool Consecutive, bool Reverse, DebugLoc DL) - : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}, DL), - Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { - assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - new VPValue(this, &Load); - setMask(Mask); - } + VPWidenMemoryRecipe *clone() override = 0; - VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, - VPValue *StoredValue, VPValue *Mask, - bool Consecutive, bool Reverse, DebugLoc DL) - : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr, StoredValue}, - DL), - Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { - assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - setMask(Mask); + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPDef::VPWidenLoadSC || + R->getVPDefID() == VPDef::VPWidenStoreSC; } - VPWidenMemoryInstructionRecipe *clone() override { - if (isStore()) - return new VPWidenMemoryInstructionRecipe( - cast(Ingredient), getAddr(), getStoredValue(), getMask(), - Consecutive, Reverse, getDebugLoc()); - - return new VPWidenMemoryInstructionRecipe(cast(Ingredient), - getAddr(), getMask(), Consecutive, - Reverse, getDebugLoc()); + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast(U); + return R && classof(R); } - VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC) + /// Return whether the loaded-from / stored-to addresses are consecutive. + bool isConsecutive() const { return Consecutive; } + + /// Return whether the consecutive loaded/stored addresses are in reverse + /// order. + bool isReverse() const { return Reverse; } /// Return the address accessed by this recipe. - VPValue *getAddr() const { - return getOperand(0); // Address is the 1st, mandatory operand. - } + VPValue *getAddr() const { return getOperand(0); } + + /// Returns true if the recipe is masked. + bool isMasked() const { return IsMasked; } /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. @@ -2350,23 +2345,34 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; } - /// Returns true if this recipe is a store. - bool isStore() const { return isa(Ingredient); } + /// Generate the wide load/store. + void execute(VPTransformState &State) override { + llvm_unreachable("VPWidenMemoryRecipe should not be instantiated."); + } - /// Return the address accessed by this recipe. - VPValue *getStoredValue() const { - assert(isStore() && "Stored value only available for store instructions"); - return getOperand(1); // Stored value is the 2nd, mandatory operand. + Instruction &getIngredient() const { return Ingredient; } +}; + +/// A recipe for widening load operations, using the address to load from and an +/// optional mask. 
+struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue { + VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, + bool Consecutive, bool Reverse, DebugLoc DL) + : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive, + Reverse, DL), + VPValue(this, &Load) { + setMask(Mask); } - // Return whether the loaded-from / stored-to addresses are consecutive. - bool isConsecutive() const { return Consecutive; } + VPWidenLoadRecipe *clone() override { + return new VPWidenLoadRecipe(cast(Ingredient), getAddr(), + getMask(), Consecutive, Reverse, + getDebugLoc()); + } - // Return whether the consecutive loaded/stored addresses are in reverse - // order. - bool isReverse() const { return Reverse; } + VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC); - /// Generate the wide load/store. + /// Generate a wide load or gather. void execute(VPTransformState &State) override; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2380,16 +2386,51 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - // Widened, consecutive memory operations only demand the first lane of - // their address, unless the same operand is also stored. That latter can - // happen with opaque pointers. - return Op == getAddr() && isConsecutive() && - (!isStore() || Op != getStoredValue()); + // Widened, consecutive loads operations only demand the first lane of + // their address. + return Op == getAddr() && isConsecutive(); } - - Instruction &getIngredient() const { return Ingredient; } }; +/// A recipe for widening store operations, using the stored value, the address +/// to store to and an optional mask. +struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { + VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal, + VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL) + : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal}, + Consecutive, Reverse, DL) { + setMask(Mask); + } + + VPWidenStoreRecipe *clone() override { + return new VPWidenStoreRecipe(cast(Ingredient), getAddr(), + getStoredValue(), getMask(), Consecutive, + Reverse, getDebugLoc()); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC); + + /// Return the value stored by this recipe. + VPValue *getStoredValue() const { return getOperand(1); } + + /// Generate a wide store or scatter. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + // Widened, consecutive stores only demand the first lane of their address, + // unless the same operand is also stored. + return Op == getAddr() && isConsecutive() && Op != getStoredValue(); + } +}; /// Recipe to expand a SCEV expression. 
class VPExpandSCEVRecipe : public VPSingleDefRecipe { const SCEV *Expr; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index c8ae2ee5a30fe5..130fb04f586e75 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -108,9 +108,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { return CI.getType(); } -Type *VPTypeAnalysis::inferScalarTypeForRecipe( - const VPWidenMemoryInstructionRecipe *R) { - assert(!R->isStore() && "Store recipes should not define any values"); +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { + assert(isa(R) && + "Store recipes should not define any values"); return cast(&R->getIngredient())->getType(); } @@ -231,8 +231,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { return inferScalarType(R->getOperand(0)); }) .Case( + VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>( [this](const auto *R) { return inferScalarTypeForRecipe(R); }) .Case([V](const VPInterleaveRecipe *R) { // TODO: Use info from interleave group. diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index 4e69de7fd6812b..7d310b1b31b6fe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -20,7 +20,7 @@ class VPInstruction; class VPWidenRecipe; class VPWidenCallRecipe; class VPWidenIntOrFpInductionRecipe; -class VPWidenMemoryInstructionRecipe; +class VPWidenMemoryRecipe; struct VPWidenSelectRecipe; class VPReplicateRecipe; class Type; @@ -46,7 +46,7 @@ class VPTypeAnalysis { Type *inferScalarTypeForRecipe(const VPWidenCallRecipe *R); Type *inferScalarTypeForRecipe(const VPWidenRecipe *R); Type *inferScalarTypeForRecipe(const VPWidenIntOrFpInductionRecipe *R); - Type *inferScalarTypeForRecipe(const VPWidenMemoryInstructionRecipe *R); + Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R); Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R); Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9f242a1bee8f6c..78932643c81fa3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -47,9 +47,8 @@ bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { case VPInterleaveSC: return cast(this)->getNumStoreOperands() > 0; - case VPWidenMemoryInstructionSC: { - return cast(this)->isStore(); - } + case VPWidenStoreSC: + return true; case VPReplicateSC: case VPWidenCallSC: return cast(getVPSingleValue()->getUnderlyingValue()) @@ -64,6 +63,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: + case VPWidenLoadSC: case VPWidenPHISC: case VPWidenSC: case VPWidenSelectSC: { @@ -81,16 +81,16 @@ bool VPRecipeBase::mayWriteToMemory() const { bool VPRecipeBase::mayReadFromMemory() const { switch (getVPDefID()) { - case VPWidenMemoryInstructionSC: { - return !cast(this)->isStore(); - } + case VPWidenLoadSC: + return true; case VPReplicateSC: case VPWidenCallSC: return cast(getVPSingleValue()->getUnderlyingValue()) ->mayReadFromMemory(); case VPBranchOnMaskSC: - case VPScalarIVStepsSC: case VPPredInstPHISC: + case VPScalarIVStepsSC: + case VPWidenStoreSC: return false; case VPBlendSC: case VPReductionSC: @@ -155,12 +155,13 @@ bool 
VPRecipeBase::mayHaveSideEffects() const { } case VPInterleaveSC: return mayWriteToMemory(); - case VPWidenMemoryInstructionSC: - assert(cast(this) - ->getIngredient() - .mayHaveSideEffects() == mayWriteToMemory() && - "mayHaveSideffects result for ingredient differs from this " - "implementation"); + case VPWidenLoadSC: + case VPWidenStoreSC: + assert( + cast(this)->getIngredient().mayHaveSideEffects() == + mayWriteToMemory() && + "mayHaveSideffects result for ingredient differs from this " + "implementation"); return mayWriteToMemory(); case VPReplicateSC: { auto *R = cast(this); @@ -1769,16 +1770,17 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } -void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = load "; + printOperands(O, SlotTracker); +} - if (!isStore()) { - getVPSingleValue()->printAsOperand(O, SlotTracker); - O << " = "; - } - O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; - +void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN store "; printOperands(O, SlotTracker); } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1256e4d8fda50b..382bf5ac114053 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -60,14 +60,14 @@ void VPlanTransforms::VPInstructionsToVPRecipes( assert(isa(&Ingredient) && "only VPInstructions expected here"); assert(!isa(Inst) && "phis should be handled above"); - // Create VPWidenMemoryInstructionRecipe for loads and stores. + // Create VPWidenMemoryRecipe for loads and stores. if (LoadInst *Load = dyn_cast(Inst)) { - NewRecipe = new VPWidenMemoryInstructionRecipe( + NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast(Inst)) { - NewRecipe = new VPWidenMemoryInstructionRecipe( + NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, Ingredient.getDebugLoc()); @@ -977,10 +977,9 @@ void VPlanTransforms::truncateToMinimalBitwidths( vp_depth_first_deep(Plan.getVectorLoopRegion()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { if (!isa(&R)) + VPWidenSelectRecipe, VPWidenMemoryRecipe>(&R)) continue; - if (isa(&R) && - cast(&R)->isStore()) + if (isa(&R)) continue; VPValue *ResultVPV = R.getVPSingleValue(); @@ -1048,10 +1047,9 @@ void VPlanTransforms::truncateToMinimalBitwidths( assert(cast(&R)->getOpcode() == Instruction::ICmp && "Only ICmps should not need extending the result."); - if (isa(&R)) { - assert(!cast(&R)->isStore() && "stores cannot be narrowed"); + assert(!isa(&R) && "stores cannot be narrowed"); + if (isa(&R)) continue; - } // Shrink operands by introducing truncates as needed. unsigned StartIdx = isa(&R) ? 
1 : 0; @@ -1315,7 +1313,7 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) { ConstantInt::getTrue(CanonicalIVPHI->getScalarType()->getContext()); VPValue *VPTrueMask = Plan.getOrAddLiveIn(TrueMask); replaceHeaderPredicateWith(Plan, *VPTrueMask, [](VPUser &U, unsigned) { - return isa(U); + return isa(U); }); // Now create the ExplicitVectorLengthPhi recipe in the main loop. auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); @@ -1371,8 +1369,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( // instruction. Widen memory instructions involved in address computation // will lead to gather/scatter instructions, which don't need to be // handled. - if (isa(CurRec) || - isa(CurRec) || + if (isa(CurRec) || isa(CurRec) || isa(CurRec) || isa(CurRec)) continue; @@ -1420,7 +1417,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( auto Iter = vp_depth_first_deep(Plan.getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &Recipe : *VPBB) { - if (auto *WidenRec = dyn_cast(&Recipe)) { + if (auto *WidenRec = dyn_cast(&Recipe)) { Instruction &UnderlyingInstr = WidenRec->getIngredient(); VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); if (AddrDef && WidenRec->isConsecutive() && diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 3f8d4f4fe7d647..0bbc7ffb4a2fe0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -36,7 +36,6 @@ class VPDef; class VPSlotTracker; class VPUser; class VPRecipeBase; -class VPWidenMemoryInstructionRecipe; // This is the base class of the VPlan Def/Use graph, used for modeling the data // flow into, within and out of the VPlan. VPValues can stand for live-ins @@ -51,7 +50,6 @@ class VPValue { friend class VPInterleavedAccessInfo; friend class VPSlotTracker; friend class VPRecipeBase; - friend class VPWidenMemoryInstructionRecipe; const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). 
@@ -358,7 +356,8 @@ class VPDef { VPWidenCanonicalIVSC, VPWidenCastSC, VPWidenGEPSC, - VPWidenMemoryInstructionSC, + VPWidenLoadSC, + VPWidenStoreSC, VPWidenSC, VPWidenSelectSC, VPBlendSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 12d37fa711db9f..5587302207acdb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -128,7 +128,7 @@ static bool verifyVPBasicBlock(const VPBasicBlock *VPBB, } return true; } - if (isa(R)) + if (isa(R)) VPWidenMemRecipe = R; return true; }; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll index d5ace655fdcc12..c22613509be4fe 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll @@ -46,8 +46,8 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -24 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -56 ; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE3]], <4 x double> poison) +; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE4]], <4 x double> poison) ; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD6]], diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index eea2894f827940..aea72b7de5f425 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -1400,15 +1400,15 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -12 ; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -3 ; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP18]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP19]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP25]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP27]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[REVERSE17:%.*]] = 
shufflevector <4 x i1> [[TMP18]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP19]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[REVERSE13]], @@ -1524,15 +1524,15 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -24 ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -7 ; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP18]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP19]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP25]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP27]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP18]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP19]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[REVERSE13]], diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 777675b623f32a..2b25c62ac2f65d 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -192,9 +192,9 @@ TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) { auto Iter = VecBB->begin(); EXPECT_NE(nullptr, dyn_cast(&*Iter++)); EXPECT_NE(nullptr, dyn_cast(&*Iter++)); - EXPECT_NE(nullptr, dyn_cast(&*Iter++)); + EXPECT_NE(nullptr, dyn_cast(&*Iter++)); EXPECT_NE(nullptr, dyn_cast(&*Iter++)); - 
EXPECT_NE(nullptr, dyn_cast(&*Iter++)); + EXPECT_NE(nullptr, dyn_cast(&*Iter++)); EXPECT_NE(nullptr, dyn_cast(&*Iter++)); EXPECT_NE(nullptr, dyn_cast(&*Iter++)); EXPECT_NE(nullptr, dyn_cast(&*Iter++)); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index cb8737a9e64d2f..64e9c06db3fe8b 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1029,7 +1029,7 @@ TEST(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) { EXPECT_EQ(&Recipe, BaseR); } -TEST(VPRecipeTest, CastVPWidenMemoryInstructionRecipeToVPUserAndVPDef) { +TEST(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) { LLVMContext C; IntegerType *Int32 = IntegerType::get(C, 32); @@ -1038,7 +1038,7 @@ TEST(VPRecipeTest, CastVPWidenMemoryInstructionRecipeToVPUserAndVPDef) { new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1)); VPValue Addr; VPValue Mask; - VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false, {}); + VPWidenLoadRecipe Recipe(*Load, &Addr, &Mask, true, false, {}); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); @@ -1133,7 +1133,7 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1)); VPValue Addr; VPValue Mask; - VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false, {}); + VPWidenLoadRecipe Recipe(*Load, &Addr, &Mask, true, false, {}); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_TRUE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1147,8 +1147,7 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPValue Addr; VPValue Mask; VPValue StoredV; - VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask, false, - false, {}); + VPWidenStoreRecipe Recipe(*Store, &Addr, &StoredV, &Mask, false, false, {}); EXPECT_TRUE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_TRUE(Recipe.mayWriteToMemory()); From f4737a2edd900df661750116821806bb45e4086a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Wed, 17 Apr 2024 12:24:32 +0200 Subject: [PATCH 232/300] update_test_checks: keep names stable with generated functions (#87988) Collect the original check lines in a manner that is independent of where the check lines appear in the file. This is so that we keep FileCheck variable names stable even when --include-generated-funcs is used. 
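A rough standalone sketch of the new collection strategy may help reviewers
(hypothetical code, not part of the patch: the two regexes below are
deliberately simplified stand-ins for common.py's CHECK_RE and
IR_FUNCTION_RE, and the demo input is made up). The point is that the
current function/prefix state is re-derived from every CHECK-LABEL line
instead of being reset by the first non-check line, so the result no longer
depends on where a function's body sits in the file:

import collections
import re

# Hypothetical, simplified stand-ins for common.py's CHECK_RE and
# IR_FUNCTION_RE; the real patterns accept more check kinds and syntax.
CHECK_RE = re.compile(r";\s*([A-Za-z0-9_-]+?)(?:-(LABEL|SAME|NEXT|DAG|NOT))?:\s*(.*)")
FUNC_RE = re.compile(r"define\s+.*@([\w.$-]+)\(")

def collect_original_check_lines(lines, prefix_set):
    """Return {func_name: {prefix: [check-line bodies]}}.

    State is keyed off LABEL lines alone, so collection is independent
    of where the check lines appear in the file.
    """
    result = collections.defaultdict(dict)
    current_prefix = None
    current_function = None
    for line in lines:
        m = CHECK_RE.match(line.lstrip())
        if m is None:
            continue  # non-check lines no longer reset the state
        prefix, kind, body = m.groups()
        if prefix not in prefix_set:
            continue
        if prefix != current_prefix:
            # A check line for another prefix ends the current run.
            current_prefix = None
            current_function = None
        if kind == "LABEL":
            fm = FUNC_RE.search(body)
            if fm is not None:
                current_prefix = prefix
                current_function = result[fm.group(1)].setdefault(prefix, [])
            continue
        if kind != "SAME" and current_function is not None:
            current_function.append(body.strip())
    return result

# @func is attributed the same way whether its checks appear inline or are
# appended at the end of the file, as happens with --include-generated-funcs.
demo = [
    "; CHECK-LABEL: define i32 @func(",
    "; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[Y]], 1",
]
print(collect_original_check_lines(demo, {"CHECK"}))

Because the walk re-synchronizes on every LABEL line, generated functions
emitted at the end of the file keep their existing FileCheck variable names.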
Reported-by: Ruiling Song --- .../Inputs/stable_ir_values_funcs.ll.expected | 10 ++-- llvm/utils/UpdateTestChecks/common.py | 59 +++++++++++-------- 2 files changed, 40 insertions(+), 29 deletions(-) diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll.expected index 1559319ac013a2..86f929ffe36af6 100644 --- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll.expected @@ -16,9 +16,9 @@ define i32 @func({i32, i32} %x, i32 %y) { ; CHECK-LABEL: define i32 @func( ; CHECK-SAME: { i32, i32 } [[X:%.*]], i32 [[Y:%.*]]) { -; CHECK-NEXT: [[X_I34:%.*]] = extractvalue { i32, i32 } [[X]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[Y]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[X_I34]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 3 -; CHECK-NEXT: ret i32 [[TMP3]] +; CHECK-NEXT: [[X_I33:%.*]] = extractvalue { i32, i32 } [[X]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[Y]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X_I33]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 3 +; CHECK-NEXT: ret i32 [[TMP2]] ; diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index eed36a0cdd73fd..15d3d5e527d61e 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -430,36 +430,47 @@ def collect_original_check_lines(ti: TestInfo, prefix_set: set): result[func_name][prefix] is filled with a list of right-hand-sides of check lines. """ - result = {} + result = collections.defaultdict(lambda: {}) + current_prefix = None current_function = None for input_line_info in ti.ro_iterlines(): input_line = input_line_info.line - if current_function is not None: - if input_line == "": - continue - if input_line.lstrip().startswith(";"): - m = CHECK_RE.match(input_line) - if ( - m is not None - and m.group(1) in prefix_set - and m.group(2) not in ["LABEL", "SAME"] - ): - if m.group(1) not in current_function: - current_function[m.group(1)] = [] - current_function[m.group(1)].append(input_line[m.end() :].strip()) - continue - current_function = None + if input_line.lstrip().startswith(";"): + m = CHECK_RE.match(input_line) + if m is not None: + prefix = m.group(1) + check_kind = m.group(2) + line = input_line[m.end() :].strip() + + if prefix != current_prefix: + current_function = None + current_prefix = None + + if check_kind not in ["LABEL", "SAME"]: + if current_function is not None: + current_function.append(line) + continue - m = IR_FUNCTION_RE.match(input_line) - if m is not None: - func_name = m.group(1) - if ti.args.function is not None and func_name != ti.args.function: - # When filtering on a specific function, skip all others. - continue + if check_kind == "SAME": + continue + + if check_kind == "LABEL": + m = IR_FUNCTION_RE.match(line) + if m is not None: + func_name = m.group(1) + if ( + ti.args.function is not None + and func_name != ti.args.function + ): + # When filtering on a specific function, skip all others. 
+ continue + + current_prefix = prefix + current_function = result[func_name][prefix] = [] + continue - assert func_name not in result - current_function = result[func_name] = {} + current_function = None return result From 3eb0ba34b0a2a29c2f34ead2b84fdf9b62cb29c1 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Wed, 17 Apr 2024 11:28:30 +0100 Subject: [PATCH 233/300] [MLIR][Flang][OpenMP] Make omp.simdloop into a loop wrapper (#87365) This patch updates the definition of `omp.simdloop` to enforce the restrictions of a wrapper operation. It has been renamed to `omp.simd`, to better reflect the naming used in the spec. All uses of "simdloop" in function names have been updated accordingly. Some changes to Flang lowering and OpenMP to LLVM IR translation are introduced to prevent the introduction of compilation/test failures. The eventual long term solution might be different. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 167 +++++++++----- .../Fir/convert-to-llvm-openmp-and-fir.fir | 101 +++++---- flang/test/Lower/OpenMP/FIR/if-clause.f90 | 23 +- flang/test/Lower/OpenMP/FIR/loop-combined.f90 | 2 +- .../OpenMP/FIR/parallel-private-clause.f90 | 3 +- flang/test/Lower/OpenMP/FIR/simd.f90 | 109 +++++---- flang/test/Lower/OpenMP/if-clause.f90 | 23 +- flang/test/Lower/OpenMP/loop-combined.f90 | 2 +- .../Lower/OpenMP/parallel-private-clause.f90 | 3 +- flang/test/Lower/OpenMP/simd.f90 | 123 +++++----- .../Frontend/OpenMPIRBuilderTest.cpp | 2 +- .../Dialect/OpenMP/OpenMPClauseOperands.h | 9 +- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 58 ++--- .../Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp | 17 +- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 45 ++-- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 41 ++-- .../OpenMPToLLVM/convert-to-llvmir.mlir | 31 +-- mlir/test/Dialect/OpenMP/invalid.mlir | 163 +++++++------ mlir/test/Dialect/OpenMP/ops.mlir | 214 +++++++++--------- mlir/test/Target/LLVMIR/openmp-llvm.mlir | 157 +++++++------ 20 files changed, 701 insertions(+), 592 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 9b997522366621..c31d63625dbb17 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -502,8 +502,10 @@ struct OpWithBodyGenInfo { OpWithBodyGenInfo(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, - mlir::Location loc, Fortran::lower::pft::Evaluation &eval) - : converter(converter), semaCtx(semaCtx), loc(loc), eval(eval) {} + mlir::Location loc, Fortran::lower::pft::Evaluation &eval, + llvm::omp::Directive dir) + : converter(converter), semaCtx(semaCtx), loc(loc), eval(eval), dir(dir) { + } OpWithBodyGenInfo &setGenNested(bool value) { genNested = value; @@ -546,6 +548,8 @@ struct OpWithBodyGenInfo { mlir::Location loc; /// [in] current PFT node/evaluation. Fortran::lower::pft::Evaluation &eval; + /// [in] leaf directive for which to generate the op body. + llvm::omp::Directive dir; /// [in] whether to generate FIR for nested evaluations bool genNested = true; /// [in] is this an outer operation - prevents privatization. @@ -568,8 +572,7 @@ struct OpWithBodyGenInfo { /// /// \param [in] op - the operation the body belongs to. /// \param [in] info - options controlling code-gen for the construction. 
-template -static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { +static void createBodyOfOp(mlir::Operation &op, OpWithBodyGenInfo &info) { fir::FirOpBuilder &firOpBuilder = info.converter.getFirOpBuilder(); auto insertMarker = [](fir::FirOpBuilder &builder) { @@ -585,10 +588,10 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { auto regionArgs = [&]() -> llvm::SmallVector { if (info.genRegionEntryCB != nullptr) { - return info.genRegionEntryCB(op); + return info.genRegionEntryCB(&op); } - firOpBuilder.createBlock(&op.getRegion()); + firOpBuilder.createBlock(&op.getRegion(0)); return {}; }(); // Mark the earliest insertion point. @@ -603,8 +606,8 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { // Start with privatization, so that the lowering of the nested // code will use the right symbols. - constexpr bool isLoop = std::is_same_v || - std::is_same_v; + bool isLoop = llvm::omp::getDirectiveAssociation(info.dir) == + llvm::omp::Association::Loop; bool privatize = info.clauses && !info.outerCombined; firOpBuilder.setInsertionPoint(marker); @@ -616,7 +619,7 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { } } - if constexpr (std::is_same_v) { + if (info.dir == llvm::omp::Directive::OMPD_parallel) { threadPrivatizeVars(info.converter, info.eval); if (info.clauses) { firOpBuilder.setInsertionPoint(marker); @@ -630,9 +633,9 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { // a lot of complications for our approach if the terminator generation // is delayed past this point. Insert a temporary terminator here, then // delete it. - firOpBuilder.setInsertionPointToEnd(&op.getRegion().back()); - auto *temp = Fortran::lower::genOpenMPTerminator( - firOpBuilder, op.getOperation(), info.loc); + firOpBuilder.setInsertionPointToEnd(&op.getRegion(0).back()); + auto *temp = + Fortran::lower::genOpenMPTerminator(firOpBuilder, &op, info.loc); firOpBuilder.setInsertionPointAfter(marker); genNestedEvaluations(info.converter, info.eval); temp->erase(); @@ -674,23 +677,36 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { return exit; }; - if (auto *exitBlock = getUniqueExit(op.getRegion())) { + if (auto *exitBlock = getUniqueExit(op.getRegion(0))) { firOpBuilder.setInsertionPointToEnd(exitBlock); - auto *term = Fortran::lower::genOpenMPTerminator( - firOpBuilder, op.getOperation(), info.loc); + auto *term = + Fortran::lower::genOpenMPTerminator(firOpBuilder, &op, info.loc); // Only insert lastprivate code when there actually is an exit block. // Such a block may not exist if the nested code produced an infinite // loop (this may not make sense in production code, but a user could // write that and we should handle it). firOpBuilder.setInsertionPoint(term); if (privatize) { + // DataSharingProcessor::processStep2() may create operations before/after + // the one passed as argument. We need to treat loop wrappers and their + // nested loop as a unit, so we need to pass the top level wrapper (if + // present). Otherwise, these operations will be inserted within a + // wrapper region. 
+ mlir::Operation *privatizationTopLevelOp = &op; + if (auto loopNest = llvm::dyn_cast(op)) { + llvm::SmallVector wrappers; + loopNest.gatherWrappers(wrappers); + if (!wrappers.empty()) + privatizationTopLevelOp = &*wrappers.back(); + } + if (!info.dsp) { assert(tempDsp.has_value()); - tempDsp->processStep2(op, isLoop); + tempDsp->processStep2(privatizationTopLevelOp, isLoop); } else { if (isLoop && regionArgs.size() > 0) info.dsp->setLoopIV(info.converter.getSymbolAddress(*regionArgs[0])); - info.dsp->processStep2(op, isLoop); + info.dsp->processStep2(privatizationTopLevelOp, isLoop); } } } @@ -921,7 +937,7 @@ template static OpTy genOpWithBody(OpWithBodyGenInfo &info, Args &&...args) { auto op = info.converter.getFirOpBuilder().create( info.loc, std::forward(args)...); - createBodyOfOp(op, info); + createBodyOfOp(*op, info); return op; } @@ -954,6 +970,18 @@ static void genFlushClauses( TODO(converter.getCurrentLocation(), "Handle OmpMemoryOrderClause"); } +static void genLoopNestClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + mlir::omp::LoopNestClauseOps &clauseOps, + llvm::SmallVectorImpl &iv) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processCollapse(loc, eval, clauseOps, iv); + clauseOps.loopInclusiveAttr = converter.getFirOpBuilder().getUnitAttr(); +} + static void genOrderedRegionClauses(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, @@ -1002,21 +1030,16 @@ static void genSectionsClauses(Fortran::lower::AbstractConverter &converter, } } -static void genSimdLoopClauses( - Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::StatementContext &stmtCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, - mlir::omp::SimdLoopClauseOps &clauseOps, - llvm::SmallVectorImpl &iv) { +static void genSimdClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::SimdClauseOps &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); - cp.processCollapse(loc, eval, clauseOps, iv); cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps); cp.processReduction(loc, clauseOps); cp.processSafelen(clauseOps); cp.processSimdlen(clauseOps); - clauseOps.loopInclusiveAttr = converter.getFirOpBuilder().getUnitAttr(); // TODO Support delayed privatization. 
cp.processTODO( - OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(genNested), + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_critical) + .setGenNested(genNested), nameAttr); } @@ -1295,7 +1320,9 @@ genMasterOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, bool genNested, mlir::Location loc) { return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(genNested)); + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_master) + .setGenNested(genNested)); } static mlir::omp::OrderedOp @@ -1317,7 +1344,9 @@ genOrderedRegionOp(Fortran::lower::AbstractConverter &converter, genOrderedRegionClauses(converter, semaCtx, clauseList, loc, clauseOps); return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(genNested), + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_ordered) + .setGenNested(genNested), clauseOps); } @@ -1345,7 +1374,8 @@ genParallelOp(Fortran::lower::AbstractConverter &converter, }; OpWithBodyGenInfo genInfo = - OpWithBodyGenInfo(converter, semaCtx, loc, eval) + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_parallel) .setGenNested(genNested) .setOuterCombined(outerCombined) .setClauses(&clauseList) @@ -1408,7 +1438,8 @@ genSectionOp(Fortran::lower::AbstractConverter &converter, // Currently only private/firstprivate clause is handled, and // all privatization is done within `omp.section` operations. return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, eval) + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_section) .setGenNested(genNested) .setClauses(&clauseList)); } @@ -1419,23 +1450,39 @@ genSectionsOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, const mlir::omp::SectionsClauseOps &clauseOps) { return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, eval).setGenNested(false), + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_sections) + .setGenNested(false), clauseOps); } -static mlir::omp::SimdLoopOp -genSimdLoopOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const Fortran::parser::OmpClauseList &clauseList) { +static mlir::omp::SimdOp +genSimdOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); DataSharingProcessor dsp(converter, semaCtx, clauseList, eval); dsp.processStep1(); Fortran::lower::StatementContext stmtCtx; - mlir::omp::SimdLoopClauseOps clauseOps; + mlir::omp::LoopNestClauseOps loopClauseOps; + mlir::omp::SimdClauseOps simdClauseOps; llvm::SmallVector iv; - genSimdLoopClauses(converter, semaCtx, stmtCtx, eval, clauseList, loc, - clauseOps, iv); + genLoopNestClauses(converter, semaCtx, eval, clauseList, loc, loopClauseOps, + iv); + genSimdClauses(converter, semaCtx, clauseList, loc, simdClauseOps); + + // Create omp.simd wrapper. + auto simdOp = firOpBuilder.create(loc, simdClauseOps); + + // TODO: Add reduction-related arguments to the wrapper's entry block. 
+ firOpBuilder.createBlock(&simdOp.getRegion()); + firOpBuilder.setInsertionPoint( + Fortran::lower::genOpenMPTerminator(firOpBuilder, simdOp, loc)); + + // Create nested omp.loop_nest and fill body with loop contents. + auto loopOp = firOpBuilder.create(loc, loopClauseOps); auto *nestedEval = getCollapsedLoopEval(eval, Fortran::lower::getCollapseValue(clauseList)); @@ -1444,12 +1491,14 @@ genSimdLoopOp(Fortran::lower::AbstractConverter &converter, return genLoopVars(op, converter, loc, iv); }; - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval) - .setClauses(&clauseList) - .setDataSharingProcessor(&dsp) - .setGenRegionEntryCb(ivCallback), - clauseOps); + createBodyOfOp(*loopOp, + OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval, + llvm::omp::Directive::OMPD_simd) + .setClauses(&clauseList) + .setDataSharingProcessor(&dsp) + .setGenRegionEntryCb(ivCallback)); + + return simdOp; } static mlir::omp::SingleOp @@ -1464,7 +1513,8 @@ genSingleOp(Fortran::lower::AbstractConverter &converter, clauseOps); return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, eval) + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_single) .setGenNested(genNested) .setClauses(&beginClauseList), clauseOps); @@ -1645,7 +1695,8 @@ genTaskOp(Fortran::lower::AbstractConverter &converter, genTaskClauses(converter, semaCtx, stmtCtx, clauseList, loc, clauseOps); return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, eval) + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_task) .setGenNested(genNested) .setClauses(&clauseList), clauseOps); @@ -1661,7 +1712,8 @@ genTaskgroupOp(Fortran::lower::AbstractConverter &converter, genTaskgroupClauses(converter, semaCtx, clauseList, loc, clauseOps); return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, eval) + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_taskgroup) .setGenNested(genNested) .setClauses(&clauseList), clauseOps); @@ -1704,7 +1756,8 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter, genTeamsClauses(converter, semaCtx, stmtCtx, clauseList, loc, clauseOps); return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, eval) + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_teams) .setGenNested(genNested) .setOuterCombined(outerCombined) .setClauses(&clauseList), @@ -1738,7 +1791,8 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter, }; return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval) + OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval, + llvm::omp::Directive::OMPD_do) .setClauses(&beginClauseList) .setDataSharingProcessor(&dsp) .setReductions(&reductionSyms, &reductionTypes) @@ -2253,7 +2307,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, endClauseList, currentLocation); } else if (llvm::omp::allSimdSet.test(ompDirective)) { // 2.9.3.1 SIMD construct - genSimdLoopOp(converter, semaCtx, eval, currentLocation, beginClauseList); + genSimdOp(converter, semaCtx, eval, currentLocation, beginClauseList); } else { genWsloopOp(converter, semaCtx, eval, currentLocation, beginClauseList, endClauseList); @@ -2341,10 +2395,9 @@ mlir::Operation *Fortran::lower::genOpenMPTerminator(fir::FirOpBuilder &builder, mlir::Operation *op, mlir::Location loc) { if (mlir::isa(op)) + mlir::omp::AtomicUpdateOp, mlir::omp::LoopNestOp>(op)) return builder.create(loc); - else - return builder.create(loc); + return builder.create(loc); 
} void Fortran::lower::genOpenMPConstruct( diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index 92628af37085a5..fa7979e8875afc 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -180,14 +180,16 @@ func.func @_QPsimd1(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref omp.parallel { %1 = fir.alloca i32 {adapt.valuebyref, pinned} %2 = fir.load %arg0 : !fir.ref - omp.simdloop for (%arg2) : i32 = (%c1_i32) to (%2) step (%c1_i32) { - fir.store %arg2 to %1 : !fir.ref - %3 = fir.load %1 : !fir.ref - %4 = fir.convert %3 : (i32) -> i64 - %5 = arith.subi %4, %c1_i64 : i64 - %6 = fir.coordinate_of %arg1, %5 : (!fir.ref>, i64) -> !fir.ref - fir.store %3 to %6 : !fir.ref - omp.yield + omp.simd { + omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%2) step (%c1_i32) { + fir.store %arg2 to %1 : !fir.ref + %3 = fir.load %1 : !fir.ref + %4 = fir.convert %3 : (i32) -> i64 + %5 = arith.subi %4, %c1_i64 : i64 + %6 = fir.coordinate_of %arg1, %5 : (!fir.ref>, i64) -> !fir.ref + fir.store %3 to %6 : !fir.ref + omp.yield + } } omp.terminator } @@ -202,8 +204,8 @@ func.func @_QPsimd1(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref // CHECK: %[[ONE_3:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[I_VAR:.*]] = llvm.alloca %[[ONE_3]] x i32 {pinned} : (i64) -> !llvm.ptr // CHECK: %[[N:.*]] = llvm.load %[[N_REF]] : !llvm.ptr -> i32 -// CHECK: omp.simdloop -// CHECK-SAME: (%[[I:.*]]) : i32 = (%[[ONE_2]]) to (%[[N]]) step (%[[ONE_2]]) { +// CHECK: omp.simd { +// CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[ONE_2]]) to (%[[N]]) step (%[[ONE_2]]) { // CHECK: llvm.store %[[I]], %[[I_VAR]] : i32, !llvm.ptr // CHECK: %[[I1:.*]] = llvm.load %[[I_VAR]] : !llvm.ptr -> i32 // CHECK: %[[I1_EXT:.*]] = llvm.sext %[[I1]] : i32 to i64 @@ -212,6 +214,7 @@ func.func @_QPsimd1(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref // CHECK: llvm.store %[[I1]], %[[ARR_I_REF]] : i32, !llvm.ptr // CHECK: omp.yield // CHECK: } +// CHECK: } // CHECK: omp.terminator // CHECK: } // CHECK: llvm.return @@ -471,55 +474,59 @@ func.func @_QPomp_target() { // ----- -func.func @_QPsimdloop_with_nested_loop() { +func.func @_QPsimd_with_nested_loop() { %0 = fir.alloca i32 {adapt.valuebyref} - %1 = fir.alloca !fir.array<10xi32> {bindc_name = "a", uniq_name = "_QFsimdloop_with_nested_loopEa"} - %2 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimdloop_with_nested_loopEi"} - %3 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFsimdloop_with_nested_loopEj"} + %1 = fir.alloca !fir.array<10xi32> {bindc_name = "a", uniq_name = "_QFsimd_with_nested_loopEa"} + %2 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimd_with_nested_loopEi"} + %3 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFsimd_with_nested_loopEj"} %c1_i32 = arith.constant 1 : i32 %c10_i32 = arith.constant 10 : i32 %c1_i32_0 = arith.constant 1 : i32 - omp.simdloop for (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32_0) { - fir.store %arg0 to %0 : !fir.ref - %c1_i32_1 = arith.constant 1 : i32 - %4 = fir.convert %c1_i32_1 : (i32) -> index - %c10_i32_2 = arith.constant 10 : i32 - %5 = fir.convert %c10_i32_2 : (i32) -> index - %c1 = arith.constant 1 : index - %6 = fir.do_loop %arg1 = %4 to %5 step %c1 -> index { - %8 = fir.convert %arg1 : (index) -> i32 - fir.store %8 to %3 : !fir.ref - %9 = fir.load %0 : !fir.ref - %10 = fir.load %0 : !fir.ref - %11 = fir.convert %10 : (i32) -> i64 - %c1_i64 = arith.constant 1 : 
i64 - %12 = arith.subi %11, %c1_i64 : i64 - %13 = fir.coordinate_of %1, %12 : (!fir.ref>, i64) -> !fir.ref - fir.store %9 to %13 : !fir.ref - %14 = arith.addi %arg1, %c1 : index - fir.result %14 : index + omp.simd { + omp.loop_nest (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32_0) { + fir.store %arg0 to %0 : !fir.ref + %c1_i32_1 = arith.constant 1 : i32 + %4 = fir.convert %c1_i32_1 : (i32) -> index + %c10_i32_2 = arith.constant 10 : i32 + %5 = fir.convert %c10_i32_2 : (i32) -> index + %c1 = arith.constant 1 : index + %6 = fir.do_loop %arg1 = %4 to %5 step %c1 -> index { + %8 = fir.convert %arg1 : (index) -> i32 + fir.store %8 to %3 : !fir.ref + %9 = fir.load %0 : !fir.ref + %10 = fir.load %0 : !fir.ref + %11 = fir.convert %10 : (i32) -> i64 + %c1_i64 = arith.constant 1 : i64 + %12 = arith.subi %11, %c1_i64 : i64 + %13 = fir.coordinate_of %1, %12 : (!fir.ref>, i64) -> !fir.ref + fir.store %9 to %13 : !fir.ref + %14 = arith.addi %arg1, %c1 : index + fir.result %14 : index + } + %7 = fir.convert %6 : (index) -> i32 + fir.store %7 to %3 : !fir.ref + omp.yield } - %7 = fir.convert %6 : (index) -> i32 - fir.store %7 to %3 : !fir.ref - omp.yield } return } -// CHECK-LABEL: llvm.func @_QPsimdloop_with_nested_loop() { +// CHECK-LABEL: llvm.func @_QPsimd_with_nested_loop() { // CHECK: %[[LOWER:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[UPPER:.*]] = llvm.mlir.constant(10 : i32) : i32 // CHECK: %[[STEP:.*]] = llvm.mlir.constant(1 : i32) : i32 -// CHECK: omp.simdloop for (%[[CNT:.*]]) : i32 = (%[[LOWER]]) to (%[[UPPER]]) inclusive step (%[[STEP]]) { -// CHECK: llvm.br ^bb1(%[[VAL_1:.*]], %[[VAL_2:.*]] : i64, i64) -// CHECK: ^bb1(%[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64): -// CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(0 : index) : i64 -// CHECK: %[[VAL_6:.*]] = llvm.icmp "sgt" %[[VAL_4]], %[[VAL_5]] : i64 -// CHECK: llvm.cond_br %[[VAL_6]], ^bb2, ^bb3 -// CHECK: ^bb2: -// CHECK: llvm.br ^bb1(%[[VAL_7:.*]], %[[VAL_8:.*]] : i64, i64) -// CHECK: ^bb3: -// CHECK: omp.yield +// CHECK: omp.simd { +// CHECK-NEXT: omp.loop_nest (%[[CNT:.*]]) : i32 = (%[[LOWER]]) to (%[[UPPER]]) inclusive step (%[[STEP]]) { +// CHECK: llvm.br ^bb1(%[[VAL_1:.*]], %[[VAL_2:.*]] : i64, i64) +// CHECK: ^bb1(%[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64): +// CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[VAL_6:.*]] = llvm.icmp "sgt" %[[VAL_4]], %[[VAL_5]] : i64 +// CHECK: llvm.cond_br %[[VAL_6]], ^bb2, ^bb3 +// CHECK: ^bb2: +// CHECK: llvm.br ^bb1(%[[VAL_7:.*]], %[[VAL_8:.*]] : i64, i64) +// CHECK: ^bb3: +// CHECK: omp.yield +// CHECK: } // CHECK: } // CHECK: llvm.return // CHECK: } diff --git a/flang/test/Lower/OpenMP/FIR/if-clause.f90 b/flang/test/Lower/OpenMP/FIR/if-clause.f90 index a1235be8e61ea2..f686b9708fc54a 100644 --- a/flang/test/Lower/OpenMP/FIR/if-clause.f90 +++ b/flang/test/Lower/OpenMP/FIR/if-clause.f90 @@ -116,7 +116,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK: omp.wsloop @@ -124,7 +124,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK: omp.wsloop @@ -134,7 +134,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -147,7 +147,7 @@ program main ! ---------------------------------------------------------------------------- ! SIMD ! ---------------------------------------------------------------------------- - ! CHECK: omp.simdloop + ! 
CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp simd @@ -155,14 +155,14 @@ program main end do !$omp end simd - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp simd if(.true.) do i = 1, 10 end do !$omp end simd - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp simd if(simd: .true.) do i = 1, 10 @@ -281,7 +281,6 @@ program main end do !$omp end target parallel do - ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -360,7 +359,7 @@ program main ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp target simd @@ -370,7 +369,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp target simd if(.true.) do i = 1, 10 @@ -379,7 +378,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp target simd if(target: .true.) if(simd: .false.) do i = 1, 10 @@ -388,7 +387,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp target simd if(target: .true.) @@ -399,7 +398,7 @@ program main ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp target simd if(simd: .true.) do i = 1, 10 diff --git a/flang/test/Lower/OpenMP/FIR/loop-combined.f90 b/flang/test/Lower/OpenMP/FIR/loop-combined.f90 index a6cec1beb49c86..6c6618dc9fb573 100644 --- a/flang/test/Lower/OpenMP/FIR/loop-combined.f90 +++ b/flang/test/Lower/OpenMP/FIR/loop-combined.f90 @@ -75,7 +75,7 @@ program main ! TARGET SIMD ! ---------------------------------------------------------------------------- ! CHECK: omp.target - ! CHECK: omp.simdloop + ! CHECK: omp.simd !$omp target simd do i = 1, 10 end do diff --git a/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 index 8f5d280943cc2e..8b75ecbaae8c73 100644 --- a/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 +++ b/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 @@ -361,7 +361,8 @@ subroutine simd_loop_1 ! FIRDialect: %[[UB:.*]] = arith.constant 9 : i32 ! FIRDialect: %[[STEP:.*]] = arith.constant 1 : i32 - ! FIRDialect: omp.simdloop for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! FIRDialect: omp.simd { + ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { !$OMP SIMD PRIVATE(r) do i=1, 9 ! FIRDialect: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref diff --git a/flang/test/Lower/OpenMP/FIR/simd.f90 b/flang/test/Lower/OpenMP/FIR/simd.f90 index c8c2022d693d46..db7d30295c45d9 100644 --- a/flang/test/Lower/OpenMP/FIR/simd.f90 +++ b/flang/test/Lower/OpenMP/FIR/simd.f90 @@ -2,32 +2,34 @@ ! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s -!CHECK-LABEL: func @_QPsimdloop() -subroutine simdloop -integer :: i +!CHECK-LABEL: func @_QPsimd() +subroutine simd + integer :: i !$OMP SIMD ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32 ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: omp.simdloop for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK-NEXT: omp.simd { + ! 
CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i=1, 9 ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref, i32) -> i1 print*, i end do - !$OMP END SIMD + !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_if_clause -subroutine simdloop_with_if_clause(n, threshold) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_if_clause +subroutine simd_with_if_clause(n, threshold) + integer :: i, n, threshold !$OMP SIMD IF( n .GE. threshold ) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 ! CHECK: %[[COND:.*]] = arith.cmpi sge - ! CHECK: omp.simdloop if(%[[COND:.*]]) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd if(%[[COND:.*]]) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -37,14 +39,15 @@ subroutine simdloop_with_if_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause -subroutine simdloop_with_simdlen_clause(n, threshold) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause +subroutine simd_with_simdlen_clause(n, threshold) + integer :: i, n, threshold !$OMP SIMD SIMDLEN(2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -54,15 +57,16 @@ subroutine simdloop_with_simdlen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause_from_param -subroutine simdloop_with_simdlen_clause_from_param(n, threshold) -integer :: i, n, threshold -integer, parameter :: simdlen = 2; +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_param +subroutine simd_with_simdlen_clause_from_param(n, threshold) + integer :: i, n, threshold + integer, parameter :: simdlen = 2; !$OMP SIMD SIMDLEN(simdlen) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! 
CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -72,15 +76,16 @@ subroutine simdloop_with_simdlen_clause_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause_from_expr_from_param -subroutine simdloop_with_simdlen_clause_from_expr_from_param(n, threshold) -integer :: i, n, threshold -integer, parameter :: simdlen = 2; +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_expr_from_param +subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold) + integer :: i, n, threshold + integer, parameter :: simdlen = 2; !$OMP SIMD SIMDLEN(simdlen*2 + 2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(6) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(6) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -90,14 +95,15 @@ subroutine simdloop_with_simdlen_clause_from_expr_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_safelen_clause -subroutine simdloop_with_safelen_clause(n, threshold) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_safelen_clause +subroutine simd_with_safelen_clause(n, threshold) + integer :: i, n, threshold !$OMP SIMD SAFELEN(2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop safelen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd safelen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -107,15 +113,16 @@ subroutine simdloop_with_safelen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_safelen_clause_from_expr_from_param -subroutine simdloop_with_safelen_clause_from_expr_from_param(n, threshold) -integer :: i, n, threshold -integer, parameter :: safelen = 2; +!CHECK-LABEL: func @_QPsimd_with_safelen_clause_from_expr_from_param +subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold) + integer :: i, n, threshold + integer, parameter :: safelen = 2; !$OMP SIMD SAFELEN(safelen*2 + 2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop safelen(6) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd safelen(6) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -125,14 +132,15 @@ subroutine simdloop_with_safelen_clause_from_expr_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_safelen_clause -subroutine simdloop_with_simdlen_safelen_clause(n, threshold) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_simdlen_safelen_clause +subroutine simd_with_simdlen_safelen_clause(n, threshold) + integer :: i, n, threshold !$OMP SIMD SIMDLEN(1) SAFELEN(2) ! 
CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(1) safelen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(1) safelen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -142,20 +150,21 @@ subroutine simdloop_with_simdlen_safelen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_collapse_clause -subroutine simdloop_with_collapse_clause(n) -integer :: i, j, n -integer :: A(n,n) -! CHECK: %[[LOWER_I:.*]] = arith.constant 1 : i32 -! CHECK: %[[UPPER_I:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref -! CHECK: %[[STEP_I:.*]] = arith.constant 1 : i32 -! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32 -! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref -! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32 -! CHECK: omp.simdloop for (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = ( -! CHECK-SAME: %[[LOWER_I]], %[[LOWER_J]]) to ( -! CHECK-SAME: %[[UPPER_I]], %[[UPPER_J]]) inclusive step ( -! CHECK-SAME: %[[STEP_I]], %[[STEP_J]]) { +!CHECK-LABEL: func @_QPsimd_with_collapse_clause +subroutine simd_with_collapse_clause(n) + integer :: i, j, n + integer :: A(n,n) + ! CHECK: %[[LOWER_I:.*]] = arith.constant 1 : i32 + ! CHECK: %[[UPPER_I:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref + ! CHECK: %[[STEP_I:.*]] = arith.constant 1 : i32 + ! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32 + ! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref + ! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32 + ! CHECK: omp.simd { + ! CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = ( + ! CHECK-SAME: %[[LOWER_I]], %[[LOWER_J]]) to ( + ! CHECK-SAME: %[[UPPER_I]], %[[UPPER_J]]) inclusive step ( + ! CHECK-SAME: %[[STEP_I]], %[[STEP_J]]) { !$OMP SIMD COLLAPSE(2) do i = 1, n do j = 1, n diff --git a/flang/test/Lower/OpenMP/if-clause.f90 b/flang/test/Lower/OpenMP/if-clause.f90 index f982bf67b07225..ce4427a0c2cab2 100644 --- a/flang/test/Lower/OpenMP/if-clause.f90 +++ b/flang/test/Lower/OpenMP/if-clause.f90 @@ -116,7 +116,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK: omp.wsloop @@ -124,7 +124,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK: omp.wsloop @@ -134,7 +134,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -147,7 +147,7 @@ program main ! ---------------------------------------------------------------------------- ! SIMD ! ---------------------------------------------------------------------------- - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp simd @@ -155,14 +155,14 @@ program main end do !$omp end simd - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp simd if(.true.) do i = 1, 10 end do !$omp end simd - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp simd if(simd: .true.) do i = 1, 10 @@ -281,7 +281,6 @@ program main end do !$omp end target parallel do - ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -360,7 +359,7 @@ program main ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! 
CHECK-SAME: { - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp target simd @@ -370,7 +369,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp target simd if(.true.) do i = 1, 10 @@ -379,7 +378,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp target simd if(target: .true.) if(simd: .false.) do i = 1, 10 @@ -388,7 +387,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp target simd if(target: .true.) @@ -399,7 +398,7 @@ program main ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp target simd if(simd: .true.) do i = 1, 10 diff --git a/flang/test/Lower/OpenMP/loop-combined.f90 b/flang/test/Lower/OpenMP/loop-combined.f90 index 70488b6a769ce4..298634b3f6f825 100644 --- a/flang/test/Lower/OpenMP/loop-combined.f90 +++ b/flang/test/Lower/OpenMP/loop-combined.f90 @@ -75,7 +75,7 @@ program main ! TARGET SIMD ! ---------------------------------------------------------------------------- ! CHECK: omp.target - ! CHECK: omp.simdloop + ! CHECK: omp.simd !$omp target simd do i = 1, 10 end do diff --git a/flang/test/Lower/OpenMP/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/parallel-private-clause.f90 index 5578b6710da7cd..775f7b4f2cb106 100644 --- a/flang/test/Lower/OpenMP/parallel-private-clause.f90 +++ b/flang/test/Lower/OpenMP/parallel-private-clause.f90 @@ -411,7 +411,8 @@ subroutine simd_loop_1 ! FIRDialect: %[[UB:.*]] = arith.constant 9 : i32 ! FIRDialect: %[[STEP:.*]] = arith.constant 1 : i32 - ! FIRDialect: omp.simdloop for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! FIRDialect: omp.simd { + ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { !$OMP SIMD PRIVATE(r) do i=1, 9 ! FIRDialect: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90 index 135b38c792623e..190aa615212176 100644 --- a/flang/test/Lower/OpenMP/simd.f90 +++ b/flang/test/Lower/OpenMP/simd.f90 @@ -3,33 +3,35 @@ !RUN: %flang_fc1 -flang-experimental-hlfir -emit-hlfir -fopenmp %s -o - | FileCheck %s !RUN: bbc -hlfir -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK-LABEL: func @_QPsimdloop() -subroutine simdloop -integer :: i +!CHECK-LABEL: func @_QPsimd() +subroutine simd + integer :: i !$OMP SIMD ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32 ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: omp.simdloop for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK-NEXT: omp.simd { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i=1, 9 ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref, i32) -> i1 print*, i end do - !$OMP END SIMD + !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_if_clause -subroutine simdloop_with_if_clause(n, threshold) - ! 
CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_if_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_if_clause +subroutine simd_with_if_clause(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_if_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold !$OMP SIMD IF( n .GE. threshold ) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 ! CHECK: %[[COND:.*]] = arith.cmpi sge - ! CHECK: omp.simdloop if(%[[COND:.*]]) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd if(%[[COND:.*]]) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -39,15 +41,16 @@ subroutine simdloop_with_if_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause -subroutine simdloop_with_simdlen_clause(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_simdlen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause +subroutine simd_with_simdlen_clause(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold !$OMP SIMD SIMDLEN(2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -57,16 +60,17 @@ subroutine simdloop_with_simdlen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause_from_param -subroutine simdloop_with_simdlen_clause_from_param(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_simdlen_clause_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold -integer, parameter :: simdlen = 2; +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_param +subroutine simd_with_simdlen_clause_from_param(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_clause_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold + integer, parameter :: simdlen = 2; !$OMP SIMD SIMDLEN(simdlen) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! 
CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -76,16 +80,17 @@ subroutine simdloop_with_simdlen_clause_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause_from_expr_from_param -subroutine simdloop_with_simdlen_clause_from_expr_from_param(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_simdlen_clause_from_expr_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold -integer, parameter :: simdlen = 2; +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_expr_from_param +subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_clause_from_expr_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold + integer, parameter :: simdlen = 2; !$OMP SIMD SIMDLEN(simdlen*2 + 2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(6) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(6) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -95,15 +100,16 @@ subroutine simdloop_with_simdlen_clause_from_expr_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_safelen_clause -subroutine simdloop_with_safelen_clause(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_safelen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_safelen_clause +subroutine simd_with_safelen_clause(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_safelen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold !$OMP SIMD SAFELEN(2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop safelen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd safelen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -113,16 +119,17 @@ subroutine simdloop_with_safelen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_safelen_clause_from_expr_from_param -subroutine simdloop_with_safelen_clause_from_expr_from_param(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_safelen_clause_from_expr_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold -integer, parameter :: safelen = 2; +!CHECK-LABEL: func @_QPsimd_with_safelen_clause_from_expr_from_param +subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_safelen_clause_from_expr_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold + integer, parameter :: safelen = 2; !$OMP SIMD SAFELEN(safelen*2 + 2) ! 
CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop safelen(6) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd safelen(6) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -132,15 +139,16 @@ subroutine simdloop_with_safelen_clause_from_expr_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_safelen_clause -subroutine simdloop_with_simdlen_safelen_clause(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_simdlen_safelen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_simdlen_safelen_clause +subroutine simd_with_simdlen_safelen_clause(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_safelen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold !$OMP SIMD SIMDLEN(1) SAFELEN(2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(1) safelen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(1) safelen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -150,20 +158,21 @@ subroutine simdloop_with_simdlen_safelen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_collapse_clause -subroutine simdloop_with_collapse_clause(n) -integer :: i, j, n -integer :: A(n,n) -! CHECK: %[[LOWER_I:.*]] = arith.constant 1 : i32 -! CHECK: %[[UPPER_I:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref -! CHECK: %[[STEP_I:.*]] = arith.constant 1 : i32 -! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32 -! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref -! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32 -! CHECK: omp.simdloop for (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = ( -! CHECK-SAME: %[[LOWER_I]], %[[LOWER_J]]) to ( -! CHECK-SAME: %[[UPPER_I]], %[[UPPER_J]]) inclusive step ( -! CHECK-SAME: %[[STEP_I]], %[[STEP_J]]) { +!CHECK-LABEL: func @_QPsimd_with_collapse_clause +subroutine simd_with_collapse_clause(n) + integer :: i, j, n + integer :: A(n,n) + ! CHECK: %[[LOWER_I:.*]] = arith.constant 1 : i32 + ! CHECK: %[[UPPER_I:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref + ! CHECK: %[[STEP_I:.*]] = arith.constant 1 : i32 + ! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32 + ! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref + ! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32 + ! CHECK: omp.simd { + ! CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = ( + ! CHECK-SAME: %[[LOWER_I]], %[[LOWER_J]]) to ( + ! CHECK-SAME: %[[UPPER_I]], %[[UPPER_J]]) inclusive step ( + ! 
CHECK-SAME: %[[STEP_I]], %[[STEP_J]]) { !$OMP SIMD COLLAPSE(2) do i = 1, n do j = 1, n diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index db1c4a8951ad2a..8344bca08404e4 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -2097,7 +2097,7 @@ TEST_F(OpenMPIRBuilderTest, ApplySimdlenSafelen) { })); } -TEST_F(OpenMPIRBuilderTest, ApplySimdLoopIf) { +TEST_F(OpenMPIRBuilderTest, ApplySimdIf) { OpenMPIRBuilder OMPBuilder(*M); IRBuilder<> Builder(BB); MapVector<Value *, Value *> AlignedVars; diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h index 27a766aceb3160..3c5fa23bd4a7f4 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h @@ -251,11 +251,10 @@ using SectionsClauseOps = detail::Clauses; // TODO `linear` clause. -using SimdLoopClauseOps = - detail::Clauses<AlignedClauseOps, CollapseClauseOps, IfClauseOps, LoopRelatedOps, NontemporalClauseOps, OrderClauseOps, SafelenClauseOps, SimdlenClauseOps>; +using SimdClauseOps = + detail::Clauses<AlignedClauseOps, IfClauseOps, NontemporalClauseOps, OrderClauseOps, SafelenClauseOps, SimdlenClauseOps>; using SingleClauseOps = detail::Clauses; diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 82be7ad31a158f..10771f6e854dde 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -562,7 +562,7 @@ def LoopNestOp : OpenMP_Op<"loop_nest", [SameVariadicOperandSize, loop operations intended to serve as a stopgap solution until the long-term representation of canonical loops is defined. Specifically, this operation is intended to serve as a unique source for loop information during the - transition to making `omp.distribute`, `omp.simdloop`, `omp.taskloop` and + transition to making `omp.distribute`, `omp.simd`, `omp.taskloop` and `omp.wsloop` wrapper operations. It is not intended to help with the addition of support for loop transformations, non-rectangular loops and non-perfectly nested loops. @@ -722,24 +722,19 @@ def WsloopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments, // Simd construct [2.9.3.1] //===----------------------------------------------------------------------===// -def SimdLoopOp : OpenMP_Op<"simdloop", [AttrSizedOperandSegments, - AllTypesMatch<["lowerBound", "upperBound", "step"]>, - DeclareOpInterfaceMethods, - RecursiveMemoryEffects]> { - let summary = "simd loop construct"; +def SimdOp : OpenMP_Op<"simd", [AttrSizedOperandSegments, + DeclareOpInterfaceMethods<LoopWrapperInterface>, + RecursiveMemoryEffects, + SingleBlockImplicitTerminator<"TerminatorOp">]> { + let summary = "simd construct"; let description = [{ The simd construct can be applied to a loop to indicate that the loop can be transformed into a SIMD loop (that is, multiple iterations of the loop can - be executed concurrently using SIMD instructions). The lower and upper - bounds specify a half-open range: the range includes the lower bound but - does not include the upper bound. If the `inclusive` attribute is specified - then the upper bound is also included. + be executed concurrently using SIMD instructions). - The body region can contain any number of blocks. The region is terminated - by "omp.yield" instruction without operands. - - Collapsed loops are represented by the simd-loop having a list of indices, - bounds and steps where the size of the list is equal to the collapse value. + The body region can contain a single block which must contain a single + operation and a terminator.
The operation must be another compatible loop + wrapper or an `omp.loop_nest`. The `alignment_values` attribute additionally specifies alignment of each corresponding aligned operand. Note that `$aligned_vars` and @@ -763,32 +758,32 @@ def SimdLoopOp : OpenMP_Op<"simdloop", [AttrSizedOperandSegments, SIMD chunk can have a distance in the logical iteration space that is greater than or equal to the value given in the clause. ``` - omp.simdloop - for (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { - // block operations - omp.yield + omp.simd { + omp.loop_nest (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { + %a = load %arrA[%i1, %i2] : memref<?x?xf32> + %b = load %arrB[%i1, %i2] : memref<?x?xf32> + %sum = arith.addf %a, %b : f32 + store %sum, %arrC[%i1, %i2] : memref<?x?xf32> + omp.yield + } } ``` }]; // TODO: Add other clauses - let arguments = (ins Variadic<IntLikeType>:$lowerBound, - Variadic<IntLikeType>:$upperBound, - Variadic<IntLikeType>:$step, - Variadic<OpenMP_PointerLikeType>:$aligned_vars, + let arguments = (ins Variadic<OpenMP_PointerLikeType>:$aligned_vars, OptionalAttr<I64ArrayAttr>:$alignment_values, Optional<I1>:$if_expr, Variadic<OpenMP_PointerLikeType>:$nontemporal_vars, OptionalAttr<OrderKindAttr>:$order_val, ConfinedAttr<OptionalAttr<I64Attr>, [IntPositive]>:$simdlen, - ConfinedAttr<OptionalAttr<I64Attr>, [IntPositive]>:$safelen, - UnitAttr:$inclusive + ConfinedAttr<OptionalAttr<I64Attr>, [IntPositive]>:$safelen ); let regions = (region AnyRegion:$region); let builders = [ - OpBuilder<(ins CArg<"const SimdLoopClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const SimdClauseOps &">:$clauses)> ]; let assemblyFormat = [{ @@ -800,14 +795,7 @@ def SimdLoopOp : OpenMP_Op<"simdloop", [AttrSizedOperandSegments, |`order` `(` custom<ClauseAttr>($order_val) `)` |`simdlen` `(` $simdlen `)` |`safelen` `(` $safelen `)` - ) `for` custom<LoopControl>($region, $lowerBound, $upperBound, $step, - type($step), $inclusive) attr-dict - }]; - - let extraClassDeclaration = [{ - /// Returns the number of loops in the simd loop nest.
- unsigned getNumLoops() { return getLowerBound().size(); } - + ) $region attr-dict }]; let hasCustomAssemblyFormat = 1; @@ -818,7 +806,7 @@ def SimdLoopOp : OpenMP_Op<"simdloop", [AttrSizedOperandSegments, def YieldOp : OpenMP_Op<"yield", [Pure, ReturnLike, Terminator, ParentOneOf<["LoopNestOp", "WsloopOp", "DeclareReductionOp", - "AtomicUpdateOp", "SimdLoopOp", "PrivateClauseOp"]>]> { + "AtomicUpdateOp", "PrivateClauseOp"]>]> { let summary = "loop yield and termination operation"; let description = [{ "omp.yield" yields SSA values from the OpenMP dialect op region and diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp index b9ada0fa0f979d..a206c7b228d21c 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp +++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp @@ -251,11 +251,11 @@ void mlir::configureOpenMPToLLVMConversionLegality( }); target.addDynamicallyLegalOp< mlir::omp::AtomicUpdateOp, mlir::omp::CriticalOp, mlir::omp::TargetOp, - mlir::omp::TargetDataOp, mlir::omp::OrderedRegionOp, - mlir::omp::ParallelOp, mlir::omp::WsloopOp, mlir::omp::SimdLoopOp, - mlir::omp::MasterOp, mlir::omp::SectionOp, mlir::omp::SectionsOp, - mlir::omp::SingleOp, mlir::omp::TaskgroupOp, mlir::omp::TaskOp, - mlir::omp::DeclareReductionOp, + mlir::omp::TargetDataOp, mlir::omp::LoopNestOp, + mlir::omp::OrderedRegionOp, mlir::omp::ParallelOp, mlir::omp::WsloopOp, + mlir::omp::SimdOp, mlir::omp::MasterOp, mlir::omp::SectionOp, + mlir::omp::SectionsOp, mlir::omp::SingleOp, mlir::omp::TaskgroupOp, + mlir::omp::TaskOp, mlir::omp::DeclareReductionOp, mlir::omp::PrivateClauseOp>([&](Operation *op) { return std::all_of(op->getRegions().begin(), op->getRegions().end(), [&](Region ®ion) { @@ -278,11 +278,12 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter, AtomicReadOpConversion, MapInfoOpConversion, ReductionOpConversion, MultiRegionOpConversion, MultiRegionOpConversion, - RegionOpConversion, RegionOpConversion, - ReductionOpConversion, RegionOpConversion, + RegionOpConversion, RegionOpConversion, + RegionOpConversion, ReductionOpConversion, + RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, - RegionOpConversion, RegionOpConversion, + RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionLessOpWithVarOperandsConversion, diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index e500d0fca741fb..caf0ac3f860172 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1604,34 +1604,37 @@ void printLoopControl(OpAsmPrinter &p, Operation *op, Region ®ion, // Simd construct [2.9.3.1] //===----------------------------------------------------------------------===// -void SimdLoopOp::build(OpBuilder &builder, OperationState &state, - const SimdLoopClauseOps &clauses) { +void SimdOp::build(OpBuilder &builder, OperationState &state, + const SimdClauseOps &clauses) { MLIRContext *ctx = builder.getContext(); // TODO Store clauses in op: privateVars, reductionByRefAttr, reductionVars, // privatizers, reductionDeclSymbols. 
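// For orientation, a minimal sketch (illustrative only, not part of this
// patch) of the wrapper IR shape this builder now produces; the %lb, %ub and
// %step names are hypothetical, and the nested omp.loop_nest is created
// separately by the caller:
//
//   omp.simd simdlen(4) {
//     omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
//       omp.yield
//     }
//   }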
- SimdLoopOp::build( - builder, state, clauses.loopLBVar, clauses.loopUBVar, clauses.loopStepVar, - clauses.alignedVars, makeArrayAttr(ctx, clauses.alignmentAttrs), - clauses.ifVar, clauses.nontemporalVars, clauses.orderAttr, - clauses.simdlenAttr, clauses.safelenAttr, clauses.loopInclusiveAttr); + SimdOp::build(builder, state, clauses.alignedVars, + makeArrayAttr(ctx, clauses.alignmentAttrs), clauses.ifVar, + clauses.nontemporalVars, clauses.orderAttr, clauses.simdlenAttr, + clauses.safelenAttr); } -LogicalResult SimdLoopOp::verify() { - if (this->getLowerBound().empty()) { - return emitOpError() << "empty lowerbound for simd loop operation"; - } - if (this->getSimdlen().has_value() && this->getSafelen().has_value() && - this->getSimdlen().value() > this->getSafelen().value()) { +LogicalResult SimdOp::verify() { + if (getSimdlen().has_value() && getSafelen().has_value() && + getSimdlen().value() > getSafelen().value()) return emitOpError() << "simdlen clause and safelen clause are both present, but the " "simdlen value is not less than or equal to safelen value"; - } - if (verifyAlignedClause(*this, this->getAlignmentValues(), - this->getAlignedVars()) + + if (verifyAlignedClause(*this, getAlignmentValues(), getAlignedVars()) .failed()) return failure(); - if (verifyNontemporalClause(*this, this->getNontemporalVars()).failed()) + + if (verifyNontemporalClause(*this, getNontemporalVars()).failed()) return failure(); + + if (!isWrapper()) + return emitOpError() << "must be a loop wrapper"; + + if (getNestedWrapper()) + return emitOpError() << "must wrap an 'omp.loop_nest' directly"; + return success(); } @@ -1662,9 +1665,9 @@ LogicalResult DistributeOp::verify() { if (LoopWrapperInterface nested = getNestedWrapper()) { // Check for the allowed leaf constructs that may appear in a composite // construct directly after DISTRIBUTE. - if (!isa<ParallelOp, SimdLoopOp>(nested)) + if (!isa<ParallelOp, SimdOp>(nested)) return emitError() << "only supported nested wrappers are 'omp.parallel' " - "and 'omp.simdloop'"; + "and 'omp.simd'"; } return success(); @@ -1876,8 +1879,8 @@ LogicalResult TaskloopOp::verify() { if (LoopWrapperInterface nested = getNestedWrapper()) { // Check for the allowed leaf constructs that may appear in a composite // construct directly after TASKLOOP. - if (!isa<SimdLoopOp>(nested)) - return emitError() << "only supported nested wrapper is 'omp.simdloop'"; + if (!isa<SimdOp>(nested)) + return emitError() << "only supported nested wrapper is 'omp.simd'"; } return success(); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 300fc8ba56fc50..e89ff9209b034a 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1406,9 +1406,10 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, /// Converts an OpenMP simd loop into LLVM IR using OpenMPIRBuilder.
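// An illustrative sketch (not from the patch) of the wrapper form this
// translation consumes after the change; %lb, %ub and %step are placeholder
// names:
//
//   omp.simd {
//     omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
//       // loop body
//       omp.yield
//     }
//   }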
static LogicalResult -convertOmpSimdLoop(Operation &opInst, llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation) { - auto loop = cast<omp::SimdLoopOp>(opInst); +convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + auto simdOp = cast<omp::SimdOp>(opInst); + auto loopOp = cast<omp::LoopNestOp>(simdOp.getWrappedLoop()); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); @@ -1421,33 +1422,34 @@ convertOmpSimdLoop(Operation &opInst, llvm::IRBuilderBase &builder, auto bodyGen = [&](llvm::OpenMPIRBuilder::InsertPointTy ip, llvm::Value *iv) { // Make sure further conversions know about the induction variable. moduleTranslation.mapValue( - loop.getRegion().front().getArgument(loopInfos.size()), iv); + loopOp.getRegion().front().getArgument(loopInfos.size()), iv); // Capture the body insertion point for use in nested loops. BodyIP of the // CanonicalLoopInfo always points to the beginning of the entry block of // the body. bodyInsertPoints.push_back(ip); - if (loopInfos.size() != loop.getNumLoops() - 1) + if (loopInfos.size() != loopOp.getNumLoops() - 1) return; // Convert the body of the loop. builder.restoreIP(ip); - convertOmpOpRegions(loop.getRegion(), "omp.simdloop.region", builder, + convertOmpOpRegions(loopOp.getRegion(), "omp.simd.region", builder, moduleTranslation, bodyGenStatus); }; // Delegate actual loop construction to the OpenMP IRBuilder. - // TODO: this currently assumes SimdLoop is semantically similar to SCF loop, - // i.e. it has a positive step, uses signed integer semantics. Reconsider - // this code when SimdLoop clearly supports more cases. + // TODO: this currently assumes omp.loop_nest is semantically similar to SCF + // loop, i.e. it has a positive step, uses signed integer semantics. + // Reconsider this code when the nested loop operation clearly supports more + // cases. llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - for (unsigned i = 0, e = loop.getNumLoops(); i < e; ++i) { + for (unsigned i = 0, e = loopOp.getNumLoops(); i < e; ++i) { llvm::Value *lowerBound = - moduleTranslation.lookupValue(loop.getLowerBound()[i]); + moduleTranslation.lookupValue(loopOp.getLowerBound()[i]); llvm::Value *upperBound = - moduleTranslation.lookupValue(loop.getUpperBound()[i]); - llvm::Value *step = moduleTranslation.lookupValue(loop.getStep()[i]); + moduleTranslation.lookupValue(loopOp.getUpperBound()[i]); + llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[i]); // Make sure loop trip count are emitted in the preheader of the outermost // loop at the latest so that they are all available for the new collapsed // loop. @@ -1473,18 +1475,18 @@ convertOmpSimdLoop(Operation &opInst, llvm::IRBuilderBase &builder, ompBuilder->collapseLoops(ompLoc.DL, loopInfos, {}); llvm::ConstantInt *simdlen = nullptr; - if (std::optional<uint64_t> simdlenVar = loop.getSimdlen()) + if (std::optional<uint64_t> simdlenVar = simdOp.getSimdlen()) simdlen = builder.getInt64(simdlenVar.value()); llvm::ConstantInt *safelen = nullptr; - if (std::optional<uint64_t> safelenVar = loop.getSafelen()) + if (std::optional<uint64_t> safelenVar = simdOp.getSafelen()) safelen = builder.getInt64(safelenVar.value()); llvm::MapVector<llvm::Value *, llvm::Value *> alignedVars; ompBuilder->applySimd( loopInfo, alignedVars, - loop.getIfExpr() ? moduleTranslation.lookupValue(loop.getIfExpr()) - : nullptr, + simdOp.getIfExpr() ?
moduleTranslation.lookupValue(simdOp.getIfExpr()) + : nullptr, llvm::omp::OrderKind::OMP_ORDER_unknown, simdlen, safelen); builder.restoreIP(afterIP); @@ -3198,8 +3200,8 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, .Case([&](omp::WsloopOp) { return convertOmpWsloop(*op, builder, moduleTranslation); }) - .Case([&](omp::SimdLoopOp) { - return convertOmpSimdLoop(*op, builder, moduleTranslation); + .Case([&](omp::SimdOp) { + return convertOmpSimd(*op, builder, moduleTranslation); }) .Case([&](omp::AtomicReadOp) { return convertOmpAtomicRead(*op, builder, moduleTranslation); @@ -3421,7 +3423,6 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation( return convertTargetOpsInNest(op, builder, moduleTranslation); } } - return convertHostOrTargetOperation(op, builder, moduleTranslation); } diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir index dc5d6969ca7896..9f45d139b81f21 100644 --- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir @@ -145,9 +145,10 @@ func.func @threadprivate(%a: !llvm.ptr) -> () { // ----- -// CHECK: llvm.func @simdloop_block_arg(%[[LOWER:.*]]: i32, %[[UPPER:.*]]: i32, %[[ITER:.*]]: i64) { -// CHECK: omp.simdloop for (%[[ARG_0:.*]]) : i32 = -// CHECK-SAME: (%[[LOWER]]) to (%[[UPPER]]) inclusive step (%[[LOWER]]) { +// CHECK: llvm.func @loop_nest_block_arg(%[[LOWER:.*]]: i32, %[[UPPER:.*]]: i32, %[[ITER:.*]]: i64) { +// CHECK: omp.simd { +// CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]]) : i32 = (%[[LOWER]]) +// CHECK-SAME: to (%[[UPPER]]) inclusive step (%[[LOWER]]) { // CHECK: llvm.br ^[[BB1:.*]](%[[ITER]] : i64) // CHECK: ^[[BB1]](%[[VAL_0:.*]]: i64): // CHECK: %[[VAL_1:.*]] = llvm.icmp "slt" %[[VAL_0]], %[[ITER]] : i64 @@ -157,17 +158,19 @@ func.func @threadprivate(%a: !llvm.ptr) -> () { // CHECK: llvm.br ^[[BB1]](%[[VAL_2]] : i64) // CHECK: ^[[BB3]]: // CHECK: omp.yield -func.func @simdloop_block_arg(%val : i32, %ub : i32, %i : index) { - omp.simdloop for (%arg0) : i32 = (%val) to (%ub) inclusive step (%val) { - cf.br ^bb1(%i : index) - ^bb1(%0: index): - %1 = arith.cmpi slt, %0, %i : index - cf.cond_br %1, ^bb2, ^bb3 - ^bb2: - %2 = arith.addi %0, %i : index - cf.br ^bb1(%2 : index) - ^bb3: - omp.yield +func.func @loop_nest_block_arg(%val : i32, %ub : i32, %i : index) { + omp.simd { + omp.loop_nest (%arg0) : i32 = (%val) to (%ub) inclusive step (%val) { + cf.br ^bb1(%i : index) + ^bb1(%0: index): + %1 = arith.cmpi slt, %0, %i : index + cf.cond_br %1, ^bb2, ^bb3 + ^bb2: + %2 = arith.addi %0, %i : index + cf.br ^bb1(%2 : index) + ^bb3: + omp.yield + } } return } diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 7f86a7f5b3182e..9323beadf45499 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -243,145 +243,168 @@ llvm.func @test_omp_wsloop_dynamic_wrong_modifier3(%lb : i64, %ub : i64, %step : // ----- -func.func @omp_simdloop(%lb : index, %ub : index, %step : i32) -> () { - // expected-error @below {{op failed to verify that all of {lowerBound, upperBound, step} have same type}} - "omp.simdloop" (%lb, %ub, %step) ({ - ^bb0(%iv: index): - omp.yield - }) {operandSegmentSizes = array} : - (index, index, i32) -> () +func.func @omp_simd() -> () { + // expected-error @below {{op must be a loop wrapper}} + omp.simd { + omp.terminator + } + return +} +// ----- + +func.func 
@omp_simd_nested_wrapper() -> () { + // expected-error @below {{op must wrap an 'omp.loop_nest' directly}} + omp.simd { + omp.distribute { + omp.terminator + } + } return } // ----- -func.func @omp_simdloop_pretty_aligned(%lb : index, %ub : index, %step : index, - %data_var : memref) -> () { +func.func @omp_simd_pretty_aligned(%lb : index, %ub : index, %step : index, + %data_var : memref) -> () { // expected-error @below {{expected '->'}} - omp.simdloop aligned(%data_var : memref) - for (%iv) : index = (%lb) to (%ub) step (%step) { - omp.yield + omp.simd aligned(%data_var : memref) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } // ----- -func.func @omp_simdloop_aligned_mismatch(%arg0 : index, %arg1 : index, - %arg2 : index, %arg3 : memref, - %arg4 : memref) -> () { +func.func @omp_simd_aligned_mismatch(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { // expected-error @below {{op expected as many alignment values as aligned variables}} - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () + "omp.simd"(%arg3, %arg4) ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } }) {alignment_values = [128], - operandSegmentSizes = array} : (index, index, index, memref, memref) -> () + operandSegmentSizes = array} : (memref, memref) -> () return } // ----- -func.func @omp_simdloop_aligned_negative(%arg0 : index, %arg1 : index, - %arg2 : index, %arg3 : memref, - %arg4 : memref) -> () { +func.func @omp_simd_aligned_negative(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { // expected-error @below {{op alignment should be greater than 0}} - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () - }) {alignment_values = [-1, 128], operandSegmentSizes = array} : (index, index, index, memref, memref) -> () + "omp.simd"(%arg3, %arg4) ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } + }) {alignment_values = [-1, 128], operandSegmentSizes = array} : (memref, memref) -> () return } // ----- -func.func @omp_simdloop_unexpected_alignment(%arg0 : index, %arg1 : index, - %arg2 : index, %arg3 : memref, - %arg4 : memref) -> () { +func.func @omp_simd_unexpected_alignment(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { // expected-error @below {{unexpected alignment values attribute}} - "omp.simdloop"(%arg0, %arg1, %arg2) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () - }) {alignment_values = [1, 128], operandSegmentSizes = array} : (index, index, index) -> () + "omp.simd"() ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } + }) {alignment_values = [1, 128]} : () -> () return } // ----- -func.func @omp_simdloop_aligned_float(%arg0 : index, %arg1 : index, - %arg2 : index, %arg3 : memref, - %arg4 : memref) -> () { +func.func @omp_simd_aligned_float(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { // expected-error @below {{failed to satisfy constraint: 64-bit integer array attribute}} - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () - }) {alignment_values = [1.5, 128], operandSegmentSizes = array} : (index, index, index, memref, memref) -> () + "omp.simd"(%arg3, %arg4) ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } + }) 
{alignment_values = [1.5, 128], operandSegmentSizes = array<i32: 2, 0, 0>} : (memref, memref) -> () return } // ----- -func.func @omp_simdloop_aligned_the_same_var(%arg0 : index, %arg1 : index, - %arg2 : index, %arg3 : memref, - %arg4 : memref) -> () { +func.func @omp_simd_aligned_the_same_var(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { // expected-error @below {{aligned variable used more than once}} - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg3) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () - }) {alignment_values = [1, 128], operandSegmentSizes = array<i32: 1, 1, 1, 2, 0, 0>} : (index, index, index, memref, memref) -> () + "omp.simd"(%arg3, %arg3) ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } + }) {alignment_values = [1, 128], operandSegmentSizes = array<i32: 2, 0, 0>} : (memref, memref) -> () return } // ----- -func.func @omp_simdloop_nontemporal_the_same_var(%arg0 : index, - %arg1 : index, - %arg2 : index, - %arg3 : memref) -> () { +func.func @omp_simd_nontemporal_the_same_var(%arg0 : index, %arg1 : index, + %arg2 : index, + %arg3 : memref) -> () { // expected-error @below {{nontemporal variable used more than once}} - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg3) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () - }) {operandSegmentSizes = array<i32: 1, 1, 1, 0, 0, 2>} : (index, index, index, memref, memref) -> () + "omp.simd"(%arg3, %arg3) ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } + }) {operandSegmentSizes = array<i32: 0, 0, 2>} : (memref, memref) -> () return } // ----- -func.func @omp_simdloop_order_value(%lb : index, %ub : index, %step : index) { +func.func @omp_simd_order_value(%lb : index, %ub : index, %step : index) { // expected-error @below {{invalid clause value: 'default'}} - omp.simdloop order(default) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield + omp.simd order(default) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } // ----- -func.func @omp_simdloop_pretty_simdlen(%lb : index, %ub : index, %step : index) -> () { +func.func @omp_simd_pretty_simdlen(%lb : index, %ub : index, %step : index) -> () { // expected-error @below {{op attribute 'simdlen' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive}} - omp.simdloop simdlen(0) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield + omp.simd simdlen(0) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } // ----- -func.func @omp_simdloop_pretty_safelen(%lb : index, %ub : index, %step : index) -> () { +func.func @omp_simd_pretty_safelen(%lb : index, %ub : index, %step : index) -> () { // expected-error @below {{op attribute 'safelen' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive}} - omp.simdloop safelen(0) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield + omp.simd safelen(0) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } // ----- -func.func @omp_simdloop_pretty_simdlen_safelen(%lb : index, %ub : index, %step : index) -> () { - // expected-error @below {{'omp.simdloop' op simdlen clause and safelen clause are both present, but the simdlen value is not less than or equal to safelen value}} - omp.simdloop simdlen(2) safelen(1) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield +func.func @omp_simd_pretty_simdlen_safelen(%lb : index, %ub : index, %step : index) -> () { + // expected-error @below {{op simdlen clause and safelen
clause are both present, but the simdlen value is not less than or equal to safelen value}} + omp.simd simdlen(2) safelen(1) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } @@ -1720,7 +1743,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { // ----- func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { - // expected-error @below {{only supported nested wrapper is 'omp.simdloop'}} + // expected-error @below {{only supported nested wrapper is 'omp.simd'}} omp.taskloop { omp.distribute { omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { @@ -1927,7 +1950,7 @@ func.func @omp_distribute_wrapper() -> () { // ----- func.func @omp_distribute_nested_wrapper(%data_var : memref) -> () { - // expected-error @below {{only supported nested wrappers are 'omp.parallel' and 'omp.simdloop'}} + // expected-error @below {{only supported nested wrappers are 'omp.parallel' and 'omp.simd'}} "omp.distribute"() ({ "omp.wsloop"() ({ %0 = arith.constant 0 : i32 diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 802e1795b3fffb..e2ca12afc14bd6 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -439,154 +439,161 @@ func.func @omp_wsloop_pretty_multiple(%lb1 : i32, %ub1 : i32, %step1 : i32, %lb2 return } -// CHECK-LABEL: omp_simdloop -func.func @omp_simdloop(%lb : index, %ub : index, %step : index) -> () { - // CHECK: omp.simdloop for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - "omp.simdloop" (%lb, %ub, %step) ({ - ^bb0(%iv: index): - omp.yield - }) {operandSegmentSizes = array} : - (index, index, index) -> () +// CHECK-LABEL: omp_simd +func.func @omp_simd(%lb : index, %ub : index, %step : index) -> () { + // CHECK: omp.simd + "omp.simd" () ({ + "omp.loop_nest" (%lb, %ub, %step) ({ + ^bb1(%iv2: index): + "omp.yield"() : () -> () + }) : (index, index, index) -> () + "omp.terminator"() : () -> () + }) : () -> () return } -// CHECK-LABEL: omp_simdloop_aligned_list -func.func @omp_simdloop_aligned_list(%arg0 : index, %arg1 : index, %arg2 : index, - %arg3 : memref, %arg4 : memref) -> () { - // CHECK: omp.simdloop aligned(%{{.*}} : memref -> 32 : i64, +// CHECK-LABEL: omp_simd_aligned_list +func.func @omp_simd_aligned_list(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : memref, %arg4 : memref) -> () { + // CHECK: omp.simd aligned( + // CHECK-SAME: %{{.*}} : memref -> 32 : i64, // CHECK-SAME: %{{.*}} : memref -> 128 : i64) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ - ^bb0(%arg5: index): + "omp.simd"(%arg3, %arg4) ({ + "omp.loop_nest" (%arg0, %arg1, %arg2) ({ + ^bb1(%iv2: index): "omp.yield"() : () -> () + }) : (index, index, index) -> () + "omp.terminator"() : () -> () }) {alignment_values = [32, 128], - operandSegmentSizes = array} : (index, index, index, memref, memref) -> () + operandSegmentSizes = array} : (memref, memref) -> () return } -// CHECK-LABEL: omp_simdloop_aligned_single -func.func @omp_simdloop_aligned_single(%arg0 : index, %arg1 : index, %arg2 : index, - %arg3 : memref, %arg4 : memref) -> () { - // CHECK: omp.simdloop aligned(%{{.*}} : memref -> 32 : i64) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3) ({ - ^bb0(%arg5: index): +// CHECK-LABEL: omp_simd_aligned_single +func.func @omp_simd_aligned_single(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : memref, %arg4 : memref) -> () { 
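// As a hedged aside (operand names below are placeholders, not part of the
// test): each entry in the aligned clause binds one variable to one
// alignment, so a single-entry form would print as
//
//   omp.simd aligned(%ptr : memref<i32> -> 32 : i64) {
//     omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
//       omp.yield
//     }
//   }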
+ // CHECK: omp.simd aligned(%{{.*}} : memref -> 32 : i64) + "omp.simd"(%arg3) ({ + "omp.loop_nest" (%arg0, %arg1, %arg2) ({ + ^bb1(%iv2: index): "omp.yield"() : () -> () + }) : (index, index, index) -> () + "omp.terminator"() : () -> () }) {alignment_values = [32], - operandSegmentSizes = array} : (index, index, index, memref) -> () + operandSegmentSizes = array} : (memref) -> () return } -// CHECK-LABEL: omp_simdloop_nontemporal_list -func.func @omp_simdloop_nontemporal_list(%arg0 : index, - %arg1 : index, - %arg2 : index, - %arg3 : memref, - %arg4 : memref) -> () { - // CHECK: omp.simdloop nontemporal(%{{.*}}, %{{.*}} : memref, memref) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ - ^bb0(%arg5: index): +// CHECK-LABEL: omp_simd_nontemporal_list +func.func @omp_simd_nontemporal_list(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { + // CHECK: omp.simd nontemporal(%{{.*}}, %{{.*}} : memref, memref) + "omp.simd"(%arg3, %arg4) ({ + "omp.loop_nest" (%arg0, %arg1, %arg2) ({ + ^bb1(%iv2: index): "omp.yield"() : () -> () - }) {operandSegmentSizes = array} : (index, index, index, memref, memref) -> () + }) : (index, index, index) -> () + "omp.terminator"() : () -> () + }) {operandSegmentSizes = array} : (memref, memref) -> () return } -// CHECK-LABEL: omp_simdloop_nontemporal_single -func.func @omp_simdloop_nontemporal_single(%arg0 : index, - %arg1 : index, - %arg2 : index, - %arg3 : memref, - %arg4 : memref) -> () { - // CHECK: omp.simdloop nontemporal(%{{.*}} : memref) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3) ({ - ^bb0(%arg5: index): +// CHECK-LABEL: omp_simd_nontemporal_single +func.func @omp_simd_nontemporal_single(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { + // CHECK: omp.simd nontemporal(%{{.*}} : memref) + "omp.simd"(%arg3) ({ + "omp.loop_nest" (%arg0, %arg1, %arg2) ({ + ^bb1(%iv2: index): "omp.yield"() : () -> () - }) {operandSegmentSizes = array} : (index, index, index, memref) -> () + }) : (index, index, index) -> () + "omp.terminator"() : () -> () + }) {operandSegmentSizes = array} : (memref) -> () return } -// CHECK-LABEL: omp_simdloop_pretty -func.func @omp_simdloop_pretty(%lb : index, %ub : index, %step : index) -> () { - // CHECK: omp.simdloop for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - omp.simdloop for (%iv) : index = (%lb) to (%ub) step (%step) { - omp.yield +// CHECK-LABEL: omp_simd_pretty +func.func @omp_simd_pretty(%lb : index, %ub : index, %step : index) -> () { + // CHECK: omp.simd { + omp.simd { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } -// CHECK-LABEL: func.func @omp_simdloop_pretty_aligned( -func.func @omp_simdloop_pretty_aligned(%lb : index, %ub : index, %step : index, - %data_var : memref, - %data_var1 : memref) -> () { - // CHECK: omp.simdloop aligned(%{{.*}} : memref -> 32 : i64, +// CHECK-LABEL: func.func @omp_simd_pretty_aligned( +func.func @omp_simd_pretty_aligned(%lb : index, %ub : index, %step : index, + %data_var : memref, + %data_var1 : memref) -> () { + // CHECK: omp.simd aligned( + // CHECK-SAME: %{{.*}} : memref -> 32 : i64, // CHECK-SAME: %{{.*}} : memref -> 128 : i64) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - omp.simdloop aligned(%data_var : memref -> 32, %data_var1 : memref -> 128) - for 
(%iv) : index = (%lb) to (%ub) step (%step) { + omp.simd aligned(%data_var : memref -> 32, %data_var1 : memref -> 128) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { omp.yield + } } return } -// CHECK-LABEL: omp_simdloop_pretty_if -func.func @omp_simdloop_pretty_if(%lb : index, %ub : index, %step : index, %if_cond : i1) -> () { - // CHECK: omp.simdloop if(%{{.*}}) for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - omp.simdloop if(%if_cond) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield +// CHECK-LABEL: omp_simd_pretty_if +func.func @omp_simd_pretty_if(%lb : index, %ub : index, %step : index, %if_cond : i1) -> () { + // CHECK: omp.simd if(%{{.*}}) + omp.simd if(%if_cond) { + omp.loop_nest (%iv): index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } -// CHECK-LABEL: func.func @omp_simdloop_pretty_nontemporal -func.func @omp_simdloop_pretty_nontemporal(%lb : index, - %ub : index, - %step : index, - %data_var : memref, - %data_var1 : memref) -> () { - // CHECK: omp.simdloop nontemporal(%{{.*}}, %{{.*}} : memref, memref) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - omp.simdloop nontemporal(%data_var, %data_var1 : memref, memref) - for (%iv) : index = (%lb) to (%ub) step (%step) { +// CHECK-LABEL: func.func @omp_simd_pretty_nontemporal +func.func @omp_simd_pretty_nontemporal(%lb : index, %ub : index, %step : index, + %data_var : memref, + %data_var1 : memref) -> () { + // CHECK: omp.simd nontemporal(%{{.*}}, %{{.*}} : memref, memref) + omp.simd nontemporal(%data_var, %data_var1 : memref, memref) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { omp.yield - } - return -} -// CHECK-LABEL: omp_simdloop_pretty_order -func.func @omp_simdloop_pretty_order(%lb : index, %ub : index, %step : index) -> () { - // CHECK: omp.simdloop order(concurrent) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - omp.simdloop order(concurrent) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield + } } return } -// CHECK-LABEL: omp_simdloop_pretty_simdlen -func.func @omp_simdloop_pretty_simdlen(%lb : index, %ub : index, %step : index) -> () { - // CHECK: omp.simdloop simdlen(2) for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - omp.simdloop simdlen(2) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield +// CHECK-LABEL: omp_simd_pretty_order +func.func @omp_simd_pretty_order(%lb : index, %ub : index, %step : index) -> () { + // CHECK: omp.simd order(concurrent) + omp.simd order(concurrent) { + omp.loop_nest (%iv): index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } -// CHECK-LABEL: omp_simdloop_pretty_safelen -func.func @omp_simdloop_pretty_safelen(%lb : index, %ub : index, %step : index) -> () { - // CHECK: omp.simdloop safelen(2) for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - omp.simdloop safelen(2) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield +// CHECK-LABEL: omp_simd_pretty_simdlen +func.func @omp_simd_pretty_simdlen(%lb : index, %ub : index, %step : index) -> () { + // CHECK: omp.simd simdlen(2) + omp.simd simdlen(2) { + omp.loop_nest (%iv): index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } -// CHECK-LABEL: omp_simdloop_pretty_multiple -func.func @omp_simdloop_pretty_multiple(%lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> () { - // CHECK: omp.simdloop for (%{{.*}}, %{{.*}}) : index = (%{{.*}}, %{{.*}}) to (%{{.*}}, %{{.*}}) step (%{{.*}}, %{{.*}}) - 
omp.simdloop for (%iv1, %iv2) : index = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { - omp.yield +// CHECK-LABEL: omp_simd_pretty_safelen +func.func @omp_simd_pretty_safelen(%lb : index, %ub : index, %step : index) -> () { + // CHECK: omp.simd safelen(2) + omp.simd safelen(2) { + omp.loop_nest (%iv): index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } @@ -633,15 +640,13 @@ func.func @omp_distribute(%chunk_size : i32, %data_var : memref, %arg0 : i3 } // CHECK: omp.distribute omp.distribute { - // TODO Remove induction variables from omp.simdloop. - omp.simdloop for (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.simd { omp.loop_nest (%iv2) : i32 = (%arg0) to (%arg0) step (%arg0) { omp.yield } - omp.yield } } -return + return } @@ -2170,14 +2175,11 @@ func.func @omp_taskloop(%lb: i32, %ub: i32, %step: i32) -> () { // CHECK: omp.taskloop { omp.taskloop { - // TODO Remove induction variables from omp.simdloop. - omp.simdloop for (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.simd { omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { // CHECK: omp.yield omp.yield } - // CHECK: omp.yield - omp.yield } } diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 4cb99c1f1a285b..d1390022c1dc44 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -638,10 +638,10 @@ llvm.func @test_omp_wsloop_guided_simd(%lb : i64, %ub : i64, %step : i64) -> () // ----- -// CHECK-LABEL: @simdloop_simple -llvm.func @simdloop_simple(%lb : i64, %ub : i64, %step : i64, %arg0: !llvm.ptr) { - "omp.simdloop" (%lb, %ub, %step) ({ - ^bb0(%iv: i64): +// CHECK-LABEL: @simd_simple +llvm.func @simd_simple(%lb : i64, %ub : i64, %step : i64, %arg0: !llvm.ptr) { + "omp.simd" () ({ + omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) { %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 // The form of the emitted IR is controlled by OpenMPIRBuilder and // tested there. Just check that the right metadata is added. @@ -649,8 +649,9 @@ llvm.func @simdloop_simple(%lb : i64, %ub : i64, %step : i64, %arg0: !llvm.ptr) %4 = llvm.getelementptr %arg0[%iv] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %3, %4 : f32, !llvm.ptr omp.yield - }) {operandSegmentSizes = array} : - (i64, i64, i64) -> () + } + "omp.terminator"() : () -> () + }) : () -> () llvm.return } @@ -659,34 +660,36 @@ llvm.func @simdloop_simple(%lb : i64, %ub : i64, %step : i64, %arg0: !llvm.ptr) // ----- -// CHECK-LABEL: @simdloop_simple_multiple -llvm.func @simdloop_simple_multiple(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) { - omp.simdloop for (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { - %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 - // The form of the emitted IR is controlled by OpenMPIRBuilder and - // tested there. Just check that the right metadata is added and collapsed - // loop bound is generated (Collapse clause is represented as a loop with - // list of indices, bounds and steps where the size of the list is equal - // to the collapse value.) 
- // CHECK: icmp slt i64 - // CHECK-COUNT-3: select - // CHECK: %[[TRIPCOUNT0:.*]] = select - // CHECK: br label %[[PREHEADER:.*]] - // CHECK: [[PREHEADER]]: - // CHECK: icmp slt i64 - // CHECK-COUNT-3: select - // CHECK: %[[TRIPCOUNT1:.*]] = select - // CHECK: mul nuw i64 %[[TRIPCOUNT0]], %[[TRIPCOUNT1]] - // CHECK: br label %[[COLLAPSED_PREHEADER:.*]] - // CHECK: [[COLLAPSED_PREHEADER]]: - // CHECK: br label %[[COLLAPSED_HEADER:.*]] - // CHECK: llvm.access.group - // CHECK-NEXT: llvm.access.group - %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %3, %4 : f32, !llvm.ptr - llvm.store %3, %5 : f32, !llvm.ptr - omp.yield +// CHECK-LABEL: @simd_simple_multiple +llvm.func @simd_simple_multiple(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) { + omp.simd { + omp.loop_nest (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { + %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 + // The form of the emitted IR is controlled by OpenMPIRBuilder and + // tested there. Just check that the right metadata is added and collapsed + // loop bound is generated (Collapse clause is represented as a loop with + // list of indices, bounds and steps where the size of the list is equal + // to the collapse value.) + // CHECK: icmp slt i64 + // CHECK-COUNT-3: select + // CHECK: %[[TRIPCOUNT0:.*]] = select + // CHECK: br label %[[PREHEADER:.*]] + // CHECK: [[PREHEADER]]: + // CHECK: icmp slt i64 + // CHECK-COUNT-3: select + // CHECK: %[[TRIPCOUNT1:.*]] = select + // CHECK: mul nuw i64 %[[TRIPCOUNT0]], %[[TRIPCOUNT1]] + // CHECK: br label %[[COLLAPSED_PREHEADER:.*]] + // CHECK: [[COLLAPSED_PREHEADER]]: + // CHECK: br label %[[COLLAPSED_HEADER:.*]] + // CHECK: llvm.access.group + // CHECK-NEXT: llvm.access.group + %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %3, %4 : f32, !llvm.ptr + llvm.store %3, %5 : f32, !llvm.ptr + omp.yield + } } llvm.return } @@ -695,19 +698,21 @@ llvm.func @simdloop_simple_multiple(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : // ----- -// CHECK-LABEL: @simdloop_simple_multiple_simdlen -llvm.func @simdloop_simple_multiple_simdlen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) { - omp.simdloop simdlen(2) for (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { - %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 - // The form of the emitted IR is controlled by OpenMPIRBuilder and - // tested there. Just check that the right metadata is added. - // CHECK: llvm.access.group - // CHECK-NEXT: llvm.access.group - %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %3, %4 : f32, !llvm.ptr - llvm.store %3, %5 : f32, !llvm.ptr - omp.yield +// CHECK-LABEL: @simd_simple_multiple_simdlen +llvm.func @simd_simple_multiple_simdlen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) { + omp.simd simdlen(2) { + omp.loop_nest (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { + %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 + // The form of the emitted IR is controlled by OpenMPIRBuilder and + // tested there. 
Just check that the right metadata is added. + // CHECK: llvm.access.group + // CHECK-NEXT: llvm.access.group + %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %3, %4 : f32, !llvm.ptr + llvm.store %3, %5 : f32, !llvm.ptr + omp.yield + } } llvm.return } @@ -717,15 +722,17 @@ llvm.func @simdloop_simple_multiple_simdlen(%lb1 : i64, %ub1 : i64, %step1 : i64 // ----- -// CHECK-LABEL: @simdloop_simple_multiple_safelen -llvm.func @simdloop_simple_multiple_safelen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) { - omp.simdloop safelen(2) for (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { - %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 - %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %3, %4 : f32, !llvm.ptr - llvm.store %3, %5 : f32, !llvm.ptr - omp.yield +// CHECK-LABEL: @simd_simple_multiple_safelen +llvm.func @simd_simple_multiple_safelen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) { + omp.simd safelen(2) { + omp.loop_nest (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { + %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 + %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %3, %4 : f32, !llvm.ptr + llvm.store %3, %5 : f32, !llvm.ptr + omp.yield + } } llvm.return } @@ -734,15 +741,17 @@ llvm.func @simdloop_simple_multiple_safelen(%lb1 : i64, %ub1 : i64, %step1 : i64 // ----- -// CHECK-LABEL: @simdloop_simple_multiple_simdlen_safelen -llvm.func @simdloop_simple_multiple_simdlen_safelen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) { - omp.simdloop simdlen(1) safelen(2) for (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { - %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 - %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %3, %4 : f32, !llvm.ptr - llvm.store %3, %5 : f32, !llvm.ptr - omp.yield +// CHECK-LABEL: @simd_simple_multiple_simdlen_safelen +llvm.func @simd_simple_multiple_simdlen_safelen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) { + omp.simd simdlen(1) safelen(2) { + omp.loop_nest (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { + %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 + %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %3, %4 : f32, !llvm.ptr + llvm.store %3, %5 : f32, !llvm.ptr + omp.yield + } } llvm.return } @@ -751,8 +760,8 @@ llvm.func @simdloop_simple_multiple_simdlen_safelen(%lb1 : i64, %ub1 : i64, %ste // ----- -// CHECK-LABEL: @simdloop_if -llvm.func @simdloop_if(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr {fir.bindc_name = "threshold"}) { +// CHECK-LABEL: @simd_if +llvm.func @simd_if(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr {fir.bindc_name = "threshold"}) { %0 = llvm.mlir.constant(1 : i64) : i64 %1 = llvm.alloca %0 x i32 
{adapt.valuebyref, in_type = i32, operandSegmentSizes = array} : (i64) -> !llvm.ptr %2 = llvm.mlir.constant(1 : i64) : i64 @@ -763,12 +772,14 @@ llvm.func @simdloop_if(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr %7 = llvm.load %arg0 : !llvm.ptr -> i32 %8 = llvm.load %arg1 : !llvm.ptr -> i32 %9 = llvm.icmp "sge" %7, %8 : i32 - omp.simdloop if(%9) for (%arg2) : i32 = (%4) to (%5) inclusive step (%6) { - // The form of the emitted IR is controlled by OpenMPIRBuilder and - // tested there. Just check that the right metadata is added. - // CHECK: llvm.access.group - llvm.store %arg2, %1 : i32, !llvm.ptr - omp.yield + omp.simd if(%9) { + omp.loop_nest (%arg2) : i32 = (%4) to (%5) inclusive step (%6) { + // The form of the emitted IR is controlled by OpenMPIRBuilder and + // tested there. Just check that the right metadata is added. + // CHECK: llvm.access.group + llvm.store %arg2, %1 : i32, !llvm.ptr + omp.yield + } } llvm.return } From 16b0be613205a37519e2bfec1a904ceaa89636e7 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Wed, 17 Apr 2024 11:30:11 +0100 Subject: [PATCH 234/300] [MLIR][OpenMP] NFC: Remove LoopControl parsing/printing code (#88909) This patch removes the LoopControl parsing/printing functions that are no longer used after transitioning `omp.simdloop` and `omp.taskloop` into loop wrapper operations. --- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 52 -------------------- 1 file changed, 52 deletions(-) diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index caf0ac3f860172..5d2281ce6094fd 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1548,58 +1548,6 @@ void printWsloop(OpAsmPrinter &p, Operation *op, Region ®ion, p.printRegion(region, /*printEntryBlockArgs=*/false); } -/// loop-control ::= `(` ssa-id-list `)` `:` type `=` loop-bounds -/// loop-bounds := `(` ssa-id-list `)` to `(` ssa-id-list `)` inclusive? steps -/// steps := `step` `(`ssa-id-list`)` -ParseResult -parseLoopControl(OpAsmParser &parser, Region ®ion, - SmallVectorImpl &lowerBound, - SmallVectorImpl &upperBound, - SmallVectorImpl &steps, - SmallVectorImpl &loopVarTypes, UnitAttr &inclusive) { - // Parse an opening `(` followed by induction variables followed by `)` - SmallVector ivs; - Type loopVarType; - if (parser.parseArgumentList(ivs, OpAsmParser::Delimiter::Paren) || - parser.parseColonType(loopVarType) || - // Parse loop bounds. - parser.parseEqual() || - parser.parseOperandList(lowerBound, ivs.size(), - OpAsmParser::Delimiter::Paren) || - parser.parseKeyword("to") || - parser.parseOperandList(upperBound, ivs.size(), - OpAsmParser::Delimiter::Paren)) - return failure(); - - if (succeeded(parser.parseOptionalKeyword("inclusive"))) - inclusive = UnitAttr::get(parser.getBuilder().getContext()); - - // Parse step values. - if (parser.parseKeyword("step") || - parser.parseOperandList(steps, ivs.size(), OpAsmParser::Delimiter::Paren)) - return failure(); - - // Now parse the body. 
- loopVarTypes = SmallVector(ivs.size(), loopVarType); - for (auto &iv : ivs) - iv.type = loopVarType; - - return parser.parseRegion(region, ivs); -} - -void printLoopControl(OpAsmPrinter &p, Operation *op, Region ®ion, - ValueRange lowerBound, ValueRange upperBound, - ValueRange steps, TypeRange loopVarTypes, - UnitAttr inclusive) { - auto args = region.front().getArguments(); - p << " (" << args << ") : " << args[0].getType() << " = (" << lowerBound - << ") to (" << upperBound << ") "; - if (inclusive) - p << "inclusive "; - p << "step (" << steps << ") "; - p.printRegion(region, /*printEntryBlockArgs=*/false); -} - //===----------------------------------------------------------------------===// // Simd construct [2.9.3.1] //===----------------------------------------------------------------------===// From a02019960b1a693320cd43b0ed6653d95877b94f Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 17 Apr 2024 18:15:26 +0800 Subject: [PATCH 235/300] [RISCV] Assert only valid AVLs in doLocalPostpass are X0 or virtual regs. NFC In vxrm.mir we were running RISCVInsertVSETVLI on pseudos that already had vsetvlis inserted and their AVLs set to $noreg. (This happened to work since doLocalPostpass got rid of the extra vsetvli) This removes the vsetvlis from the test and enforces that the only valid AVLs we work with are either X0 or virtual registers (or $noreg before emitVSETVLIs), since we don't handle physical registers properly in doLocalPostpass. --- llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 16 +++++++--------- llvm/test/CodeGen/RISCV/rvv/vxrm.mir | 8 ++++---- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index a54a1148cf28b9..6e45f0c703ceb8 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -468,6 +468,7 @@ class VSETVLIInfo { bool isUnknown() const { return State == Unknown; } void setAVLReg(Register Reg) { + assert(Reg.isVirtual() || Reg == RISCV::X0 || Reg == RISCV::NoRegister); AVLReg = Reg; State = AVLIsReg; } @@ -1514,12 +1515,9 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI, // If the AVL is a register, we need to make sure MI's AVL dominates PrevMI. // For now just check that PrevMI uses the same virtual register. 
- if (AVL.isReg() && AVL.getReg() != RISCV::X0) { - if (AVL.getReg().isPhysical()) - return false; - if (!PrevAVL.isReg() || PrevAVL.getReg() != AVL.getReg()) - return false; - } + if (AVL.isReg() && AVL.getReg() != RISCV::X0 && + (!PrevAVL.isReg() || PrevAVL.getReg() != AVL.getReg())) + return false; } assert(PrevMI.getOperand(2).isImm() && MI.getOperand(2).isImm()); @@ -1543,9 +1541,9 @@ void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) { continue; } - Register VRegDef = MI.getOperand(0).getReg(); - if (VRegDef != RISCV::X0 && - !(VRegDef.isVirtual() && MRI->use_nodbg_empty(VRegDef))) + Register RegDef = MI.getOperand(0).getReg(); + assert(RegDef == RISCV::X0 || RegDef.isVirtual()); + if (RegDef != RISCV::X0 && !MRI->use_nodbg_empty(RegDef)) Used.demandVL(); if (NextMI) { diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir index 64e191887e092c..a588677bec8e2f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir @@ -11,9 +11,9 @@ body: | ; MIR-LABEL: name: verify_vxrm ; MIR: liveins: $v8, $v9, $x10 ; MIR-NEXT: {{ $}} - ; MIR-NEXT: dead $x0 = PseudoVSETVLI renamable $x10, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype + ; MIR-NEXT: dead $x0 = PseudoVSETVLI killed renamable $x10, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype ; MIR-NEXT: WriteVXRMImm 0, implicit-def $vxrm - ; MIR-NEXT: renamable $v8 = PseudoVAADD_VV_MF8 undef $v8, renamable $v8, renamable $v9, 0, $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype, implicit $vxrm + ; MIR-NEXT: renamable $v8 = PseudoVAADD_VV_MF8 undef $v8, killed renamable $v8, killed renamable $v9, 0, $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype, implicit $vxrm ; MIR-NEXT: PseudoRET implicit $v8 ; ASM-LABEL: verify_vxrm: ; ASM: # %bb.0: @@ -23,8 +23,8 @@ body: | ; ASM-NEXT: ret %0:vr = COPY $v8 %1:vr = COPY $v9 - dead $x0 = PseudoVSETVLI killed renamable $x10, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype + %2:gprnox0 = COPY $x10 %pt:vr = IMPLICIT_DEF - renamable $v8 = PseudoVAADD_VV_MF8 %pt, killed renamable $v8, killed renamable $v9, 0, $noreg, 3 /* e8 */, 0 + renamable $v8 = PseudoVAADD_VV_MF8 %pt, %0, %1, 0, %2, 3 /* e8 */, 0 PseudoRET implicit $v8 ... From a634f3ef39c0c547b87f1ee4ebe02ee3a256587f Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Wed, 17 Apr 2024 11:39:18 +0100 Subject: [PATCH 236/300] [RemoveDIs] Update update_test_checks script to recognize dbg_records (#87388) As we've added new IR elements for the RemoveDIs project, we need the update_test_checks script to understand them. For the records themselves this is already done automatically, but their metadata arguments are not recognized as such due to lacking the `metadata` prefix, which means they won't be checked by the script. This patch fixes this by adding a check for all `![0-9]+` patterns as long as they are not at the start of a line (which avoids matching global values). 
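A minimal standalone sketch of that idea, assuming nothing about the script's
internals (the names below are illustrative, not the script's actual
identifiers; the real change is the one-line edit to common.py in the diff
that follows):

    import re

    # Match metadata ids such as "!14" anywhere on a line except in column
    # zero, so global definitions like "!14 = !DILocalVariable(...)" are
    # left alone while uses inside dbg_records (and !dbg attachments) are
    # still captured for checking.
    METADATA_USE = re.compile(r'(?!^)(![0-9]+)', re.MULTILINE)

    sample = (
        '  #dbg_declare(ptr %i, !14, !DIExpression(), !23)\n'
        '!14 = !DILocalVariable(name: "i")\n'
    )
    print(METADATA_USE.findall(sample))  # ['!14', '!23'] -- definition line skipped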
--- .../Inputs/various_ir_values_dbgrecords.ll | 168 ++++++++++ .../various_ir_values_dbgrecords.ll.expected | 238 ++++++++++++++ ...s_ir_values_dbgrecords.ll.funcsig.expected | 240 ++++++++++++++ ...ues_dbgrecords.ll.funcsig.globals.expected | 309 ++++++++++++++++++ ...s_dbgrecords.ll.funcsig.noglobals.expected | 238 ++++++++++++++ ...ords.ll.funcsig.transitiveglobals.expected | 299 +++++++++++++++++ .../various_ir_values_dbgrecords.test | 24 ++ llvm/utils/UpdateTestChecks/common.py | 2 +- 8 files changed, 1517 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.globals.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.noglobals.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.transitiveglobals.expected create mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll new file mode 100644 index 00000000000000..9a9cc0a06936f9 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll @@ -0,0 +1,168 @@ +; Just run it through opt, no passes needed. 
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, 
!dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: 
DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.expected new file mode 100644 index 00000000000000..1f9c37ccfbd889 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.expected @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; Just run it through opt, no passes needed. 
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, !DIAssignID [[DIASSIGNID16:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: #dbg_assign(i1 undef, [[META13:![0-9]+]], !DIExpression(), [[DIASSIGNID16]], ptr [[A_ADDR]], !DIExpression(), [[META17:![0-9]+]]) +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META13]], !DIExpression(), [[META17]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2:[0-9]+]], !dbg [[DBG22:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META14:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META23]], !tbaa [[TBAA24:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG22]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG26:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG28:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG31:![0-9]+]], !prof [[PROF32:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG35:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG34]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG34]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG36:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG34]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG37]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG37]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG33]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa 
!18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +; CHECK-LABEL: @bar( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META43:![0-9]+]], !DIExpression(), [[META46:![0-9]+]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG47:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META44:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META48]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG47]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG49:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG51:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG53:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG54:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG55:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG56:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG56]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG56]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG56]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG59:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG59]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG59]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label 
[[FOR_COND]], !dbg [[DBG55]], !llvm.loop [[LOOP60:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG62:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, 
type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.expected new file mode 100644 index 00000000000000..5905e443deff2a --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.expected @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; Just run it through opt, no passes needed. 
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +; CHECK-LABEL: define {{[^@]+}}@foo +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG7:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, !DIAssignID [[DIASSIGNID16:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: #dbg_assign(i1 undef, [[META13:![0-9]+]], !DIExpression(), [[DIASSIGNID16]], ptr [[A_ADDR]], !DIExpression(), [[META17:![0-9]+]]) +; CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META13]], !DIExpression(), [[META17]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2:[0-9]+]], !dbg [[DBG22:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META14:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META23]], !tbaa [[TBAA24:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG22]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG26:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG28:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG31:![0-9]+]], !prof [[PROF32:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG35:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG34]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG34]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG36:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG34]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG37]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG37]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG33]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, 
align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +; CHECK-LABEL: define {{[^@]+}}@bar +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0]] !dbg [[DBG41:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META43:![0-9]+]], !DIExpression(), [[META46:![0-9]+]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG47:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META44:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META48]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG47]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG49:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG51:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG53:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG54:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG55:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG56:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG56]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG56]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG56]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG59:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: 
[[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG59]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG59]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG55]], !llvm.loop [[LOOP60:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG62:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) 
+!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.globals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.globals.expected new file mode 100644 index 00000000000000..579d6a437d0e54 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.globals.expected @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; Just run it through opt, no passes needed. 
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +; CHECK-LABEL: define {{[^@]+}}@foo +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG7:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, !DIAssignID [[DIASSIGNID16:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: #dbg_assign(i1 undef, [[META13:![0-9]+]], !DIExpression(), [[DIASSIGNID16]], ptr [[A_ADDR]], !DIExpression(), [[META17:![0-9]+]]) +; CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META13]], !DIExpression(), [[META17]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2:[0-9]+]], !dbg [[DBG22:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META14:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META23]], !tbaa [[TBAA24:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG22]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG26:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG28:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG31:![0-9]+]], !prof [[PROF32:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG35:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG34]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG34]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG36:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG34]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG37]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG37]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG33]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, 
align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +; CHECK-LABEL: define {{[^@]+}}@bar +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0]] !dbg [[DBG41:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META43:![0-9]+]], !DIExpression(), [[META46:![0-9]+]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG47:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META44:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META48]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG47]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG49:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG51:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG53:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG54:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG55:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG56:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG56]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG56]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG56]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG59:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: 
[[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG59]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG59]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG55]], !llvm.loop [[LOOP60:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG62:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) 
+!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) +;. +; CHECK: attributes #[[ATTR0]] = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +; CHECK: attributes #[[ATTR2]] = { nounwind } +;. 
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None) +; CHECK: [[META1]] = !DIFile(filename: "various_ir_values.c", directory: {{.*}}) +; CHECK: [[META2]] = !{} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} +; CHECK: [[META4:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +; CHECK: [[META5:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[DBG7]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 1, type: [[META8:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META12:![0-9]+]]) +; CHECK: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; CHECK: [[META9]] = !{null, [[META10:![0-9]+]]} +; CHECK: [[META10]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META11:![0-9]+]], size: 64) +; CHECK: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; CHECK: [[META12]] = !{[[META13]], [[META14]]} +; CHECK: [[META13]] = !DILocalVariable(name: "A", arg: 1, scope: [[DBG7]], file: [[META1]], line: 1, type: [[META10]]) +; CHECK: [[META14]] = !DILocalVariable(name: "i", scope: [[META15:![0-9]+]], file: [[META1]], line: 3, type: [[META11]]) +; CHECK: [[META15]] = distinct !DILexicalBlock(scope: [[DBG7]], file: [[META1]], line: 3, column: 3) +; CHECK: [[DIASSIGNID16]] = distinct !DIAssignID() +; CHECK: [[META17]] = !DILocation(line: 1, column: 15, scope: [[DBG7]]) +; CHECK: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +; CHECK: [[META19]] = !{!"any pointer", [[META20:![0-9]+]], i64 0} +; CHECK: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} +; CHECK: [[META21]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[DBG22]] = !DILocation(line: 3, column: 8, scope: [[META15]]) +; CHECK: [[META23]] = !DILocation(line: 3, column: 12, scope: [[META15]]) +; CHECK: [[TBAA24]] = !{[[META25:![0-9]+]], [[META25]], i64 0} +; CHECK: [[META25]] = !{!"int", [[META20]], i64 0} +; CHECK: [[DBG26]] = !DILocation(line: 3, column: 19, scope: [[META27:![0-9]+]]) +; CHECK: [[META27]] = distinct !DILexicalBlock(scope: [[META15]], file: [[META1]], line: 3, column: 3) +; CHECK: [[DBG28]] = !DILocation(line: 3, column: 24, scope: [[META27]]) +; CHECK: [[DBG29]] = !DILocation(line: 3, column: 23, scope: [[META27]]) +; CHECK: [[DBG30]] = !DILocation(line: 3, column: 21, scope: [[META27]]) +; CHECK: [[DBG31]] = !DILocation(line: 3, column: 3, scope: [[META15]]) +; CHECK: [[PROF32]] = !{!"branch_weights", i32 1, i32 1048575} +; CHECK: [[DBG33]] = !DILocation(line: 3, column: 3, scope: [[META27]]) +; CHECK: [[DBG34]] = !DILocation(line: 4, column: 5, scope: [[META27]]) +; CHECK: [[DBG35]] = !DILocation(line: 4, column: 7, scope: [[META27]]) +; CHECK: [[DBG36]] = !DILocation(line: 4, column: 10, scope: [[META27]]) +; CHECK: [[DBG37]] = !DILocation(line: 3, column: 27, scope: [[META27]]) +; CHECK: [[LOOP38]] = distinct !{[[LOOP38]], [[DBG31]], [[META39:![0-9]+]]} +; CHECK: [[META39]] = !DILocation(line: 4, column: 12, scope: [[META15]]) +; CHECK: [[DBG40]] = !DILocation(line: 5, column: 1, scope: [[DBG7]]) +; CHECK: [[DBG41]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 7, type: [[META8]], scopeLine: 7, flags: 
DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META42:![0-9]+]]) +; CHECK: [[META42]] = !{[[META43]], [[META44]]} +; CHECK: [[META43]] = !DILocalVariable(name: "A", arg: 1, scope: [[DBG41]], file: [[META1]], line: 7, type: [[META10]]) +; CHECK: [[META44]] = !DILocalVariable(name: "i", scope: [[META45:![0-9]+]], file: [[META1]], line: 9, type: [[META11]]) +; CHECK: [[META45]] = distinct !DILexicalBlock(scope: [[DBG41]], file: [[META1]], line: 9, column: 3) +; CHECK: [[META46]] = !DILocation(line: 7, column: 15, scope: [[DBG41]]) +; CHECK: [[DBG47]] = !DILocation(line: 9, column: 8, scope: [[META45]]) +; CHECK: [[META48]] = !DILocation(line: 9, column: 12, scope: [[META45]]) +; CHECK: [[DBG49]] = !DILocation(line: 9, column: 19, scope: [[META50:![0-9]+]]) +; CHECK: [[META50]] = distinct !DILexicalBlock(scope: [[META45]], file: [[META1]], line: 9, column: 3) +; CHECK: [[DBG51]] = !DILocation(line: 9, column: 24, scope: [[META50]]) +; CHECK: [[DBG52]] = !DILocation(line: 9, column: 23, scope: [[META50]]) +; CHECK: [[DBG53]] = !DILocation(line: 9, column: 21, scope: [[META50]]) +; CHECK: [[DBG54]] = !DILocation(line: 9, column: 3, scope: [[META45]]) +; CHECK: [[DBG55]] = !DILocation(line: 9, column: 3, scope: [[META50]]) +; CHECK: [[DBG56]] = !DILocation(line: 10, column: 5, scope: [[META50]]) +; CHECK: [[DBG57]] = !DILocation(line: 10, column: 7, scope: [[META50]]) +; CHECK: [[DBG58]] = !DILocation(line: 10, column: 10, scope: [[META50]]) +; CHECK: [[DBG59]] = !DILocation(line: 9, column: 27, scope: [[META50]]) +; CHECK: [[LOOP60]] = distinct !{[[LOOP60]], [[DBG54]], [[META61:![0-9]+]]} +; CHECK: [[META61]] = !DILocation(line: 10, column: 12, scope: [[META45]]) +; CHECK: [[DBG62]] = !DILocation(line: 11, column: 1, scope: [[DBG41]]) +;. diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.noglobals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.noglobals.expected new file mode 100644 index 00000000000000..1f9c37ccfbd889 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.noglobals.expected @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; Just run it through opt, no passes needed. 
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, !DIAssignID [[DIASSIGNID16:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: #dbg_assign(i1 undef, [[META13:![0-9]+]], !DIExpression(), [[DIASSIGNID16]], ptr [[A_ADDR]], !DIExpression(), [[META17:![0-9]+]]) +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META13]], !DIExpression(), [[META17]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2:[0-9]+]], !dbg [[DBG22:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META14:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META23]], !tbaa [[TBAA24:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG22]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG26:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG28:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG31:![0-9]+]], !prof [[PROF32:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG35:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG34]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG34]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG36:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG34]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG37]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG37]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG33]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa 
!18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +; CHECK-LABEL: @bar( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META43:![0-9]+]], !DIExpression(), [[META46:![0-9]+]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG47:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META44:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META48]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG47]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG49:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG51:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG53:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG54:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG55:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG56:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG56]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG56]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG56]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG59:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG59]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG59]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label 
[[FOR_COND]], !dbg [[DBG55]], !llvm.loop [[LOOP60:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG62:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, 
type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.transitiveglobals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.transitiveglobals.expected new file mode 100644 index 00000000000000..e2c426029a6b03 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.transitiveglobals.expected @@ -0,0 +1,299 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals smart +; Just run it through opt, no passes needed. 
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, !DIAssignID [[DIASSIGNID16:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: #dbg_assign(i1 undef, [[META13:![0-9]+]], !DIExpression(), [[DIASSIGNID16]], ptr [[A_ADDR]], !DIExpression(), [[META17:![0-9]+]]) +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META13]], !DIExpression(), [[META17]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2:[0-9]+]], !dbg [[DBG22:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META14:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META23]], !tbaa [[TBAA24:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG22]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG26:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG28:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG31:![0-9]+]], !prof [[PROF32:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG35:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG34]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG34]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG36:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG34]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG37]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG37]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG33]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa 
!18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +; CHECK-LABEL: @bar( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META43:![0-9]+]], !DIExpression(), [[META46:![0-9]+]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG47:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META44:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META48]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG47]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG49:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG51:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG53:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG54:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG55:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG56:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG56]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG56]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG56]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG59:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG59]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG59]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label 
[[FOR_COND]], !dbg [[DBG55]], !llvm.loop [[LOOP60:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG62:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, 
type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) +;. 
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None) +; CHECK: [[META1]] = !DIFile(filename: "various_ir_values.c", directory: {{.*}}) +; CHECK: [[META2]] = !{} +; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 1, type: [[META8:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META12:![0-9]+]]) +; CHECK: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; CHECK: [[META9]] = !{null, [[META10:![0-9]+]]} +; CHECK: [[META10]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META11:![0-9]+]], size: 64) +; CHECK: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; CHECK: [[META12]] = !{[[META13]], [[META14]]} +; CHECK: [[META13]] = !DILocalVariable(name: "A", arg: 1, scope: [[META7]], file: [[META1]], line: 1, type: [[META10]]) +; CHECK: [[META14]] = !DILocalVariable(name: "i", scope: [[META15:![0-9]+]], file: [[META1]], line: 3, type: [[META11]]) +; CHECK: [[META15]] = distinct !DILexicalBlock(scope: [[META7]], file: [[META1]], line: 3, column: 3) +; CHECK: [[DIASSIGNID16]] = distinct !DIAssignID() +; CHECK: [[META17]] = !DILocation(line: 1, column: 15, scope: [[META7]]) +; CHECK: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +; CHECK: [[META19]] = !{!"any pointer", [[META20:![0-9]+]], i64 0} +; CHECK: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} +; CHECK: [[META21]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[DBG22]] = !DILocation(line: 3, column: 8, scope: [[META15]]) +; CHECK: [[META23]] = !DILocation(line: 3, column: 12, scope: [[META15]]) +; CHECK: [[TBAA24]] = !{[[META25:![0-9]+]], [[META25]], i64 0} +; CHECK: [[META25]] = !{!"int", [[META20]], i64 0} +; CHECK: [[DBG26]] = !DILocation(line: 3, column: 19, scope: [[META27:![0-9]+]]) +; CHECK: [[META27]] = distinct !DILexicalBlock(scope: [[META15]], file: [[META1]], line: 3, column: 3) +; CHECK: [[DBG28]] = !DILocation(line: 3, column: 24, scope: [[META27]]) +; CHECK: [[DBG29]] = !DILocation(line: 3, column: 23, scope: [[META27]]) +; CHECK: [[DBG30]] = !DILocation(line: 3, column: 21, scope: [[META27]]) +; CHECK: [[DBG31]] = !DILocation(line: 3, column: 3, scope: [[META15]]) +; CHECK: [[PROF32]] = !{!"branch_weights", i32 1, i32 1048575} +; CHECK: [[DBG33]] = !DILocation(line: 3, column: 3, scope: [[META27]]) +; CHECK: [[DBG34]] = !DILocation(line: 4, column: 5, scope: [[META27]]) +; CHECK: [[DBG35]] = !DILocation(line: 4, column: 7, scope: [[META27]]) +; CHECK: [[DBG36]] = !DILocation(line: 4, column: 10, scope: [[META27]]) +; CHECK: [[DBG37]] = !DILocation(line: 3, column: 27, scope: [[META27]]) +; CHECK: [[LOOP38]] = distinct !{[[LOOP38]], [[DBG31]], [[META39:![0-9]+]]} +; CHECK: [[META39]] = !DILocation(line: 4, column: 12, scope: [[META15]]) +; CHECK: [[DBG40]] = !DILocation(line: 5, column: 1, scope: [[META7]]) +; CHECK: [[META41:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 7, type: [[META8]], scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META42:![0-9]+]]) +; CHECK: [[META42]] = !{[[META43]], [[META44]]} +; CHECK: [[META43]] = !DILocalVariable(name: 
"A", arg: 1, scope: [[META41]], file: [[META1]], line: 7, type: [[META10]]) +; CHECK: [[META44]] = !DILocalVariable(name: "i", scope: [[META45:![0-9]+]], file: [[META1]], line: 9, type: [[META11]]) +; CHECK: [[META45]] = distinct !DILexicalBlock(scope: [[META41]], file: [[META1]], line: 9, column: 3) +; CHECK: [[META46]] = !DILocation(line: 7, column: 15, scope: [[META41]]) +; CHECK: [[DBG47]] = !DILocation(line: 9, column: 8, scope: [[META45]]) +; CHECK: [[META48]] = !DILocation(line: 9, column: 12, scope: [[META45]]) +; CHECK: [[DBG49]] = !DILocation(line: 9, column: 19, scope: [[META50:![0-9]+]]) +; CHECK: [[META50]] = distinct !DILexicalBlock(scope: [[META45]], file: [[META1]], line: 9, column: 3) +; CHECK: [[DBG51]] = !DILocation(line: 9, column: 24, scope: [[META50]]) +; CHECK: [[DBG52]] = !DILocation(line: 9, column: 23, scope: [[META50]]) +; CHECK: [[DBG53]] = !DILocation(line: 9, column: 21, scope: [[META50]]) +; CHECK: [[DBG54]] = !DILocation(line: 9, column: 3, scope: [[META45]]) +; CHECK: [[DBG55]] = !DILocation(line: 9, column: 3, scope: [[META50]]) +; CHECK: [[DBG56]] = !DILocation(line: 10, column: 5, scope: [[META50]]) +; CHECK: [[DBG57]] = !DILocation(line: 10, column: 7, scope: [[META50]]) +; CHECK: [[DBG58]] = !DILocation(line: 10, column: 10, scope: [[META50]]) +; CHECK: [[DBG59]] = !DILocation(line: 9, column: 27, scope: [[META50]]) +; CHECK: [[LOOP60]] = distinct !{[[LOOP60]], [[DBG54]], [[META61:![0-9]+]]} +; CHECK: [[META61]] = !DILocation(line: 10, column: 12, scope: [[META45]]) +; CHECK: [[DBG62]] = !DILocation(line: 11, column: 1, scope: [[META41]]) +;. diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test new file mode 100644 index 00000000000000..9cc77d894d62c3 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test @@ -0,0 +1,24 @@ +## Basic test checking that update_test_checks.py works correctly on various "IR value" kinds +# RUN: cp -f %S/Inputs/various_ir_values_dbgrecords.ll %t.ll && %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.expected +## Check that running the script again does not change the result: +# RUN: %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.expected +## Also try the --function-signature flag +# RUN: %update_test_checks %t.ll --function-signature +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.expected +## Verify that running without the --function-signature flag does not removes +## the -SAME: lines since the generated file will have --function-signature in +## an UTC_ARGS: comment in the first line (from the invocation above) which is +## added to the update invocation below. 
+# RUN: %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.expected +## Also try the --check-globals flag +# RUN: %update_test_checks %t.ll --check-globals +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.globals.expected +# RUN: cp -f %S/Inputs/various_ir_values_dbgrecords.ll %t.ll && %update_test_checks %t.ll --function-signature --check-globals all +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.globals.expected +# RUN: cp -f %S/Inputs/various_ir_values_dbgrecords.ll %t.ll && %update_test_checks %t.ll --check-globals none +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.noglobals.expected +# RUN: cp -f %S/Inputs/various_ir_values_dbgrecords.ll %t.ll && %update_test_checks %t.ll --check-globals smart +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.transitiveglobals.expected diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index 15d3d5e527d61e..5595e6f417555d 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -1082,10 +1082,10 @@ def get_value_use(self, var, match, var_prefix=None): NamelessValue(r"TBAA_STRUCT", "!", r"!tbaa.struct ", r"![0-9]+", None), NamelessValue(r"RNG", "!", r"!range ", r"![0-9]+", None), NamelessValue(r"LOOP", "!", r"!llvm.loop ", r"![0-9]+", None), - NamelessValue(r"META", "!", r"metadata ", r"![0-9]+", None), NamelessValue(r"META", "!", r"", r"![0-9]+", r"(?:distinct |)!.*"), NamelessValue(r"ACC_GRP", "!", r"!llvm.access.group ", r"![0-9]+", None), NamelessValue(r"META", "!", r"![a-z.]+ ", r"![0-9]+", None), + NamelessValue(r"META", "!", r"[, (]", r"![0-9]+", None), ] global_nameless_values = [ From 792d437b56adfb3416daf8105942d4899fb82763 Mon Sep 17 00:00:00 2001 From: NagyDonat Date: Wed, 17 Apr 2024 12:52:23 +0200 Subject: [PATCH 237/300] [clang-tidy NFC] Fix a typo in docs for sizeof-expression (#88912) "Till heaven and earth pass, one jot, or one tittle shall not pass of the law" --- .../docs/clang-tidy/checks/bugprone/sizeof-expression.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst index a3e88b837d3758..c37df1706eb4e1 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst @@ -190,6 +190,6 @@ Options .. option:: WarnOnSizeOfPointerToAggregate - When `true, the check will warn on an expression like + When `true`, the check will warn on an expression like ``sizeof(expr)`` where the expression is a pointer to aggregate. Default is `true`. From 5f3e106de3cd5ce6d7ba37fb11f6ad740cb430c5 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Wed, 17 Apr 2024 13:05:15 +0200 Subject: [PATCH 238/300] [lldb/linux] Make sure the process continues running after a detach (#88494) Fixes #85084 Whenever an inferior thread stops, lldb-server sends a SIGSTOP to all other threads in the process to force them to stop as well. If those threads stop on their own before they get a signal, this SIGSTOP will remain pending and be delivered the next time the process resumes. Normally, this is not a problem, because lldb-server will detect this stale SIGSTOP and resume the process. 
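For reference, the POSIX rule the fix relies on is that generating SIGCONT for a process resumes it if it is stopped and discards any of its pending stop signals. A minimal standalone program (an illustrative sketch, not lldb code) demonstrating that behavior:

```cpp
// Sketch only: stop a child with SIGSTOP, then observe that SIGCONT both
// resumes it and (per POSIX) discards any still-pending stop signals.
#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main() {
  pid_t child = fork();
  if (child == 0)
    for (;;)
      pause(); // child: idle until signalled

  int status;
  kill(child, SIGSTOP); // suspend the child, as lldb-server does
  waitpid(child, &status, WUNTRACED);
  printf("stopped: %d\n", WIFSTOPPED(status)); // prints 1

  kill(child, SIGCONT); // resume; also cancels pending SIGSTOPs
  waitpid(child, &status, WCONTINUED);
  printf("continued: %d\n", WIFCONTINUED(status)); // prints 1

  kill(child, SIGKILL); // clean up
  waitpid(child, &status, 0);
  return 0;
}
```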
However, if we detach from the process while it has these SIGSTOPs pending, they will be delivered immediately, and the process will remain stopped (most likely forever). This patch fixes that by sending a SIGCONT just before detaching from the process. This signal cancels out any pending SIGSTOPs and ensures the process is able to run after we detach. It does have one somewhat unfortunate side-effect in that the process's SIGCONT handler (if it has one) will get executed spuriously (from the process's POV). This could sometimes be avoided by tracking which threads got sent a SIGSTOP, and whether those threads stopped due to it. From what I could tell by observing its behavior, this is what gdb does. I have not tried to replicate that behavior here because it adds a nontrivial amount of complexity and the result is still uncertain -- we still need to send a SIGCONT (and execute the handler) when any thread stops for some other reason (and leaves our SIGSTOP hanging). Furthermore, since SIGSTOPs don't stack, it's also possible that our SIGSTOP/SIGCONT combination will cancel a genuine SIGSTOP being sent to the debugged application (by someone else), and there is nothing we can do about that. For this reason I think it's simplest and most predictable to just always send a SIGCONT when detaching, but if it turns out this is breaking something, we can consider implementing something more elaborate. One alternative I did try is to use PTRACE_INTERRUPT to suspend the threads instead of a SIGSTOP. PTRACE_INTERRUPT requires using PTRACE_SEIZE to attach to the process, which also made this solution somewhat complicated, but the main problem with that approach is that PTRACE_INTERRUPT is not considered to be a signal-delivery-stop, which means it's not possible to resume it while injecting another signal into the inferior (which some of our tests expect to be able to do). This limitation could be worked around by forcing the thread into a signal delivery stop whenever we need to do this, but this additional complication is what made me think this approach is also not worthwhile. This patch should fix (at least some of) the problems with TestConcurrentVFork, but I've also added a dedicated test for checking that a process keeps running after we detach. Although the problem I'm fixing here is linux-specific, the core functionality of not stopping after a detach should function the same way everywhere. --- .../Process/Linux/NativeProcessLinux.cpp | 4 ++ .../commands/process/detach-resumes/Makefile | 4 ++ .../detach-resumes/TestDetachResumes.py | 59 +++++++++++++++++++ .../commands/process/detach-resumes/main.cpp | 48 +++++++++++++++ .../concurrent_vfork/TestConcurrentVFork.py | 16 ----- 5 files changed, 115 insertions(+), 16 deletions(-) create mode 100644 lldb/test/API/commands/process/detach-resumes/Makefile create mode 100644 lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py create mode 100644 lldb/test/API/commands/process/detach-resumes/main.cpp diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp index 5d2b4b03fe60cb..59fc8726b76739 100644 --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -1089,6 +1089,10 @@ Status NativeProcessLinux::Detach() { if (GetID() == LLDB_INVALID_PROCESS_ID) return error; + // Cancel out any SIGSTOPs we may have sent while stopping the process.
+ // Otherwise, the process may stop as soon as we detach from it. + kill(GetID(), SIGCONT); + for (const auto &thread : m_threads) { Status e = Detach(thread->GetID()); if (e.Fail()) diff --git a/lldb/test/API/commands/process/detach-resumes/Makefile b/lldb/test/API/commands/process/detach-resumes/Makefile new file mode 100644 index 00000000000000..c46619c6623481 --- /dev/null +++ b/lldb/test/API/commands/process/detach-resumes/Makefile @@ -0,0 +1,4 @@ +CXX_SOURCES := main.cpp +ENABLE_THREADS := YES + +include Makefile.rules diff --git a/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py b/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py new file mode 100644 index 00000000000000..57727294ddc3d3 --- /dev/null +++ b/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py @@ -0,0 +1,59 @@ +""" +Test that the process continues running after we detach from it. +""" + +import lldb +import time +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class DetachResumesTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test_detach_resumes(self): + self.build() + exe = self.getBuildArtifact() + + # The inferior will use this file to let us know it is ready to be + # attached. + sync_file_path = lldbutil.append_to_process_working_directory( + self, "sync_file_%d" % (int(time.time())) + ) + + # And this one to let us know it is running after we've detached from + # it. + exit_file_path = lldbutil.append_to_process_working_directory( + self, "exit_file_%d" % (int(time.time())) + ) + + popen = self.spawnSubprocess( + self.getBuildArtifact(exe), [sync_file_path, exit_file_path] + ) + lldbutil.wait_for_file_on_target(self, sync_file_path) + + self.runCmd("process attach -p " + str(popen.pid)) + + # Set a breakpoint at a place that will be called by multiple threads + # simultaneously. On systems (e.g. linux) where the debugger needs to + # send signals to suspend threads, these signals will race with threads + # hitting the breakpoint (and stopping on their own). + bpno = lldbutil.run_break_set_by_symbol(self, "break_here") + + # And let the inferior know it can call the function. + self.runCmd("expr -- wait_for_debugger_flag = false") + + self.runCmd("continue") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + # Detach, the process should keep running after this, and not be stopped + # by the signals that the debugger may have used to suspend the threads. + self.runCmd("detach") + + lldbutil.wait_for_file_on_target(self, exit_file_path) diff --git a/lldb/test/API/commands/process/detach-resumes/main.cpp b/lldb/test/API/commands/process/detach-resumes/main.cpp new file mode 100644 index 00000000000000..e8050fef2c3850 --- /dev/null +++ b/lldb/test/API/commands/process/detach-resumes/main.cpp @@ -0,0 +1,48 @@ +#include "pseudo_barrier.h" +#include +#include +#include +#include +#include +#include + +pseudo_barrier_t barrier; + +constexpr size_t nthreads = 5; +volatile bool wait_for_debugger_flag = true; + +void break_here() {} + +void tfunc() { + pseudo_barrier_wait(barrier); + + break_here(); +} + +int main(int argc, char const *argv[]) { + lldb_enable_attach(); + + if (argc < 3) + return 1; + + // Create a file to signal that this process has started up. + std::ofstream(argv[1]).close(); + + // And wait for it to attach. 
+ for (int i = 0; i < 100 && wait_for_debugger_flag; ++i) + std::this_thread::sleep_for(std::chrono::seconds(1)); + + // Fire up the threads and have them call break_here() simultaneously. + pseudo_barrier_init(barrier, nthreads); + std::vector<std::thread> threads; + for (size_t i = 0; i < nthreads; ++i) + threads.emplace_back(tfunc); + + for (std::thread &t : threads) + t.join(); + + // Create the file to let the debugger know we're running. + std::ofstream(argv[2]).close(); + + return 0; +} diff --git a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py index 1790bd497f4e6b..2dcbb728549fb4 100644 --- a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py +++ b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py @@ -48,8 +48,6 @@ def follow_child_helper(self, use_fork, call_exec): self.expect("continue", patterns=[r"exited with status = 1[0-4]"]) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_parent_vfork_no_exec(self): """ Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent. @@ -58,8 +56,6 @@ def test_follow_parent_vfork_no_exec(self): self.follow_parent_helper(use_fork=False, call_exec=False) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_parent_fork_no_exec(self): """ Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-parent. @@ -68,8 +64,6 @@ def test_follow_parent_fork_no_exec(self): self.follow_parent_helper(use_fork=True, call_exec=False) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_parent_vfork_call_exec(self): """ Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent. @@ -78,8 +72,6 @@ def test_follow_parent_vfork_call_exec(self): self.follow_parent_helper(use_fork=False, call_exec=True) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_parent_fork_call_exec(self): """ Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent. @@ -88,8 +80,6 @@ def test_follow_parent_fork_call_exec(self): self.follow_parent_helper(use_fork=True, call_exec=True) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_child_vfork_no_exec(self): """ Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child. @@ -98,8 +88,6 @@ def test_follow_child_vfork_no_exec(self): self.follow_child_helper(use_fork=False, call_exec=False) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_child_fork_no_exec(self): """ Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child.
@@ -108,8 +96,6 @@ def test_follow_child_fork_no_exec(self): self.follow_child_helper(use_fork=True, call_exec=False) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_child_vfork_call_exec(self): """ Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child. @@ -118,8 +104,6 @@ def test_follow_child_vfork_call_exec(self): self.follow_child_helper(use_fork=False, call_exec=True) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_child_fork_call_exec(self): """ Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child. From c8dca5bc0733e2fba81008fc33fcad1f45ba666a Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Wed, 17 Apr 2024 12:17:50 +0100 Subject: [PATCH 239/300] [Flang][OpenMP][Lower] Refactor lowering of compound constructs (#87070) This patch simplifies the lowering from PFT to MLIR of OpenMP compound constructs (i.e. combined and composite). The new approach consists of iteratively processing the outermost leaf construct of the given combined construct until it cannot be split further. Both leaf constructs and composite ones have `gen...()` functions that are called when appropriate. This approach enables treating a leaf construct the same way regardless of whether it appeared as part of a combined construct, and it also enables the lowering of composite constructs as a single unit. Previous corner cases are now handled in a more straightforward way and comments pointing to the relevant spec section are added. Directive sets are also completed with missing LOOP related constructs.
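In outline, the new lowering loop looks like this (a simplified sketch of the approach; genLeafConstruct is a stand-in for the per-directive gen...() dispatch and is not a real function in this patch):

```cpp
// Peel the outermost leaf construct off the (possibly still combined)
// directive until nothing is left to split.
std::optional<llvm::omp::Directive> nextDir = originalDirective;
bool outermost = true;
while (nextDir) {
  llvm::omp::Directive leafDir;
  std::tie(leafDir, nextDir) = splitCombinedDirective(*nextDir);
  // Composite constructs come back unsplit (nextDir empty) and are lowered
  // as a single unit; only the innermost leaf generates nested evaluations.
  genLeafConstruct(leafDir, /*genNested=*/!nextDir.has_value(),
                   /*outerCombined=*/outermost && nextDir.has_value());
  outermost = false;
}
```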
--- .../flang/Semantics/openmp-directive-sets.h | 57 ++- flang/lib/Lower/OpenMP/OpenMP.cpp | 430 ++++++++++++------ 2 files changed, 333 insertions(+), 154 deletions(-) diff --git a/flang/include/flang/Semantics/openmp-directive-sets.h b/flang/include/flang/Semantics/openmp-directive-sets.h index 91773ae3ea9a3e..842d251b682aa9 100644 --- a/flang/include/flang/Semantics/openmp-directive-sets.h +++ b/flang/include/flang/Semantics/openmp-directive-sets.h @@ -32,14 +32,14 @@ static const OmpDirectiveSet topDistributeSet{ static const OmpDirectiveSet allDistributeSet{ OmpDirectiveSet{ - llvm::omp::OMPD_target_teams_distribute, - llvm::omp::OMPD_target_teams_distribute_parallel_do, - llvm::omp::OMPD_target_teams_distribute_parallel_do_simd, - llvm::omp::OMPD_target_teams_distribute_simd, - llvm::omp::OMPD_teams_distribute, - llvm::omp::OMPD_teams_distribute_parallel_do, - llvm::omp::OMPD_teams_distribute_parallel_do_simd, - llvm::omp::OMPD_teams_distribute_simd, + Directive::OMPD_target_teams_distribute, + Directive::OMPD_target_teams_distribute_parallel_do, + Directive::OMPD_target_teams_distribute_parallel_do_simd, + Directive::OMPD_target_teams_distribute_simd, + Directive::OMPD_teams_distribute, + Directive::OMPD_teams_distribute_parallel_do, + Directive::OMPD_teams_distribute_parallel_do_simd, + Directive::OMPD_teams_distribute_simd, } | topDistributeSet, }; @@ -63,10 +63,24 @@ static const OmpDirectiveSet allDoSet{ } | topDoSet, }; +static const OmpDirectiveSet topLoopSet{ + Directive::OMPD_loop, +}; + +static const OmpDirectiveSet allLoopSet{ + OmpDirectiveSet{ + Directive::OMPD_parallel_loop, + Directive::OMPD_target_parallel_loop, + Directive::OMPD_target_teams_loop, + Directive::OMPD_teams_loop, + } | topLoopSet, +}; + static const OmpDirectiveSet topParallelSet{ Directive::OMPD_parallel, Directive::OMPD_parallel_do, Directive::OMPD_parallel_do_simd, + Directive::OMPD_parallel_loop, Directive::OMPD_parallel_masked_taskloop, Directive::OMPD_parallel_masked_taskloop_simd, Directive::OMPD_parallel_master_taskloop, @@ -82,6 +96,7 @@ static const OmpDirectiveSet allParallelSet{ Directive::OMPD_target_parallel, Directive::OMPD_target_parallel_do, Directive::OMPD_target_parallel_do_simd, + Directive::OMPD_target_parallel_loop, Directive::OMPD_target_teams_distribute_parallel_do, Directive::OMPD_target_teams_distribute_parallel_do_simd, Directive::OMPD_teams_distribute_parallel_do, @@ -118,12 +133,14 @@ static const OmpDirectiveSet topTargetSet{ Directive::OMPD_target_parallel, Directive::OMPD_target_parallel_do, Directive::OMPD_target_parallel_do_simd, + Directive::OMPD_target_parallel_loop, Directive::OMPD_target_simd, Directive::OMPD_target_teams, Directive::OMPD_target_teams_distribute, Directive::OMPD_target_teams_distribute_parallel_do, Directive::OMPD_target_teams_distribute_parallel_do_simd, Directive::OMPD_target_teams_distribute_simd, + Directive::OMPD_target_teams_loop, }; static const OmpDirectiveSet allTargetSet{topTargetSet}; @@ -156,11 +173,12 @@ static const OmpDirectiveSet topTeamsSet{ static const OmpDirectiveSet allTeamsSet{ OmpDirectiveSet{ - llvm::omp::OMPD_target_teams, - llvm::omp::OMPD_target_teams_distribute, - llvm::omp::OMPD_target_teams_distribute_parallel_do, - llvm::omp::OMPD_target_teams_distribute_parallel_do_simd, - llvm::omp::OMPD_target_teams_distribute_simd, + Directive::OMPD_target_teams, + Directive::OMPD_target_teams_distribute, + Directive::OMPD_target_teams_distribute_parallel_do, + Directive::OMPD_target_teams_distribute_parallel_do_simd, + 
Directive::OMPD_target_teams_distribute_simd, + Directive::OMPD_target_teams_loop, } | topTeamsSet, }; @@ -178,6 +196,14 @@ static const OmpDirectiveSet allDistributeSimdSet{ static const OmpDirectiveSet allDoSimdSet{allDoSet & allSimdSet}; static const OmpDirectiveSet allTaskloopSimdSet{allTaskloopSet & allSimdSet}; +static const OmpDirectiveSet compositeConstructSet{ + Directive::OMPD_distribute_parallel_do, + Directive::OMPD_distribute_parallel_do_simd, + Directive::OMPD_distribute_simd, + Directive::OMPD_do_simd, + Directive::OMPD_taskloop_simd, +}; + static const OmpDirectiveSet blockConstructSet{ Directive::OMPD_master, Directive::OMPD_ordered, @@ -201,12 +227,14 @@ static const OmpDirectiveSet loopConstructSet{ Directive::OMPD_distribute_simd, Directive::OMPD_do, Directive::OMPD_do_simd, + Directive::OMPD_loop, Directive::OMPD_masked_taskloop, Directive::OMPD_masked_taskloop_simd, Directive::OMPD_master_taskloop, Directive::OMPD_master_taskloop_simd, Directive::OMPD_parallel_do, Directive::OMPD_parallel_do_simd, + Directive::OMPD_parallel_loop, Directive::OMPD_parallel_masked_taskloop, Directive::OMPD_parallel_masked_taskloop_simd, Directive::OMPD_parallel_master_taskloop, @@ -214,17 +242,20 @@ static const OmpDirectiveSet loopConstructSet{ Directive::OMPD_simd, Directive::OMPD_target_parallel_do, Directive::OMPD_target_parallel_do_simd, + Directive::OMPD_target_parallel_loop, Directive::OMPD_target_simd, Directive::OMPD_target_teams_distribute, Directive::OMPD_target_teams_distribute_parallel_do, Directive::OMPD_target_teams_distribute_parallel_do_simd, Directive::OMPD_target_teams_distribute_simd, + Directive::OMPD_target_teams_loop, Directive::OMPD_taskloop, Directive::OMPD_taskloop_simd, Directive::OMPD_teams_distribute, Directive::OMPD_teams_distribute_parallel_do, Directive::OMPD_teams_distribute_parallel_do_simd, Directive::OMPD_teams_distribute_simd, + Directive::OMPD_teams_loop, Directive::OMPD_tile, Directive::OMPD_unroll, }; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index c31d63625dbb17..bb38082b245ef5 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -488,6 +488,81 @@ markDeclareTarget(mlir::Operation *op, declareTargetOp.setDeclareTarget(deviceType, captureClause); } +/// Split a combined directive into an outer leaf directive and the (possibly +/// combined) rest of the combined directive. Composite directives and +/// non-compound directives are not split, in which case it will return the +/// input directive as its first output and an empty value as its second output. 
+static std::pair<llvm::omp::Directive, std::optional<llvm::omp::Directive>> +splitCombinedDirective(llvm::omp::Directive dir) { + using D = llvm::omp::Directive; + switch (dir) { + case D::OMPD_masked_taskloop: + return {D::OMPD_masked, D::OMPD_taskloop}; + case D::OMPD_masked_taskloop_simd: + return {D::OMPD_masked, D::OMPD_taskloop_simd}; + case D::OMPD_master_taskloop: + return {D::OMPD_master, D::OMPD_taskloop}; + case D::OMPD_master_taskloop_simd: + return {D::OMPD_master, D::OMPD_taskloop_simd}; + case D::OMPD_parallel_do: + return {D::OMPD_parallel, D::OMPD_do}; + case D::OMPD_parallel_do_simd: + return {D::OMPD_parallel, D::OMPD_do_simd}; + case D::OMPD_parallel_masked: + return {D::OMPD_parallel, D::OMPD_masked}; + case D::OMPD_parallel_masked_taskloop: + return {D::OMPD_parallel, D::OMPD_masked_taskloop}; + case D::OMPD_parallel_masked_taskloop_simd: + return {D::OMPD_parallel, D::OMPD_masked_taskloop_simd}; + case D::OMPD_parallel_master: + return {D::OMPD_parallel, D::OMPD_master}; + case D::OMPD_parallel_master_taskloop: + return {D::OMPD_parallel, D::OMPD_master_taskloop}; + case D::OMPD_parallel_master_taskloop_simd: + return {D::OMPD_parallel, D::OMPD_master_taskloop_simd}; + case D::OMPD_parallel_sections: + return {D::OMPD_parallel, D::OMPD_sections}; + case D::OMPD_parallel_workshare: + return {D::OMPD_parallel, D::OMPD_workshare}; + case D::OMPD_target_parallel: + return {D::OMPD_target, D::OMPD_parallel}; + case D::OMPD_target_parallel_do: + return {D::OMPD_target, D::OMPD_parallel_do}; + case D::OMPD_target_parallel_do_simd: + return {D::OMPD_target, D::OMPD_parallel_do_simd}; + case D::OMPD_target_simd: + return {D::OMPD_target, D::OMPD_simd}; + case D::OMPD_target_teams: + return {D::OMPD_target, D::OMPD_teams}; + case D::OMPD_target_teams_distribute: + return {D::OMPD_target, D::OMPD_teams_distribute}; + case D::OMPD_target_teams_distribute_parallel_do: + return {D::OMPD_target, D::OMPD_teams_distribute_parallel_do}; + case D::OMPD_target_teams_distribute_parallel_do_simd: + return {D::OMPD_target, D::OMPD_teams_distribute_parallel_do_simd}; + case D::OMPD_target_teams_distribute_simd: + return {D::OMPD_target, D::OMPD_teams_distribute_simd}; + case D::OMPD_teams_distribute: + return {D::OMPD_teams, D::OMPD_distribute}; + case D::OMPD_teams_distribute_parallel_do: + return {D::OMPD_teams, D::OMPD_distribute_parallel_do}; + case D::OMPD_teams_distribute_parallel_do_simd: + return {D::OMPD_teams, D::OMPD_distribute_parallel_do_simd}; + case D::OMPD_teams_distribute_simd: + return {D::OMPD_teams, D::OMPD_distribute_simd}; + case D::OMPD_parallel_loop: + return {D::OMPD_parallel, D::OMPD_loop}; + case D::OMPD_target_parallel_loop: + return {D::OMPD_target, D::OMPD_parallel_loop}; + case D::OMPD_target_teams_loop: + return {D::OMPD_target, D::OMPD_teams_loop}; + case D::OMPD_teams_loop: + return {D::OMPD_teams, D::OMPD_loop}; + default: + return {dir, std::nullopt}; + } +} + //===----------------------------------------------------------------------===// // Op body generation helper structures and functions //===----------------------------------------------------------------------===// @@ -1676,7 +1751,6 @@ static OpTy genTargetEnterExitUpdateDataOp( } else { llvm_unreachable("Unexpected TARGET DATA construct"); } - mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps; genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx, clauseList, loc, directive, clauseOps); @@ -1804,16 +1878,44 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter, // Code generation functions for composite constructs
//===----------------------------------------------------------------------===// -static void genCompositeDoSimd( +static void genCompositeDistributeParallelDo( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList *endClauseList, mlir::Location loc) { + TODO(loc, "Composite DISTRIBUTE PARALLEL DO"); +} + +static void genCompositeDistributeParallelDoSimd( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList *endClauseList, mlir::Location loc) { + TODO(loc, "Composite DISTRIBUTE PARALLEL DO SIMD"); +} + +static void genCompositeDistributeSimd( Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, llvm::omp::Directive ompDirective, + Fortran::lower::pft::Evaluation &eval, const Fortran::parser::OmpClauseList &beginClauseList, const Fortran::parser::OmpClauseList *endClauseList, mlir::Location loc) { + TODO(loc, "Composite DISTRIBUTE SIMD"); +} + +static void +genCompositeDoSimd(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList *endClauseList, + mlir::Location loc) { ClauseProcessor cp(converter, semaCtx, beginClauseList); cp.processTODO<clause::Aligned, clause::Allocate, clause::Linear, - clause::Order, clause::Safelen, clause::Simdlen>(loc, - ompDirective); + clause::Order, clause::Safelen, clause::Simdlen>( + loc, llvm::omp::OMPD_do_simd); // TODO: Add support for vectorization - add vectorization hints inside loop // body. // OpenMP standard does not specify the length of vector instructions.
@@ -1825,6 +1927,16 @@ static void genCompositeDoSimd( genWsloopOp(converter, semaCtx, eval, loc, beginClauseList, endClauseList); } +static void +genCompositeTaskloopSimd(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList *endClauseList, + mlir::Location loc) { + TODO(loc, "Composite TASKLOOP SIMD"); +} + //===----------------------------------------------------------------------===// // OpenMPDeclarativeConstruct visitors //===----------------------------------------------------------------------===// @@ -2082,13 +2194,18 @@ genOMP(Fortran::lower::AbstractConverter &converter, std::get(blockConstruct.t); const auto &endBlockDirective = std::get(blockConstruct.t); - const auto &directive = - std::get(beginBlockDirective.t); + mlir::Location currentLocation = + converter.genLocation(beginBlockDirective.source); + const auto origDirective = + std::get(beginBlockDirective.t).v; const auto &beginClauseList = std::get(beginBlockDirective.t); const auto &endClauseList = std::get(endBlockDirective.t); + assert(llvm::omp::blockConstructSet.test(origDirective) && + "Expected block construct"); + for (const Fortran::parser::OmpClause &clause : beginClauseList.v) { mlir::Location clauseLocation = converter.genLocation(clause.source); if (!std::get_if(&clause.u) && @@ -2124,93 +2241,74 @@ genOMP(Fortran::lower::AbstractConverter &converter, TODO(clauseLocation, "OpenMP Block construct clause"); } - bool singleDirective = true; - mlir::Location currentLocation = converter.genLocation(directive.source); - switch (directive.v) { - case llvm::omp::Directive::OMPD_master: - genMasterOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation); - break; - case llvm::omp::Directive::OMPD_ordered: - genOrderedRegionOp(converter, semaCtx, eval, /*genNested=*/true, - currentLocation, beginClauseList); - break; - case llvm::omp::Directive::OMPD_parallel: - genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/true, - currentLocation, beginClauseList); - break; - case llvm::omp::Directive::OMPD_single: - genSingleOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, - beginClauseList, endClauseList); - break; - case llvm::omp::Directive::OMPD_target: - genTargetOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, + std::optional nextDir = origDirective; + bool outermostLeafConstruct = true; + while (nextDir) { + llvm::omp::Directive leafDir; + std::tie(leafDir, nextDir) = splitCombinedDirective(*nextDir); + const bool genNested = !nextDir; + const bool outerCombined = outermostLeafConstruct && nextDir.has_value(); + switch (leafDir) { + case llvm::omp::Directive::OMPD_master: + // 2.16 MASTER construct. + genMasterOp(converter, semaCtx, eval, genNested, currentLocation); + break; + case llvm::omp::Directive::OMPD_ordered: + // 2.17.9 ORDERED construct. + genOrderedRegionOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_parallel: + // 2.6 PARALLEL construct. + genParallelOp(converter, symTable, semaCtx, eval, genNested, + currentLocation, beginClauseList, outerCombined); + break; + case llvm::omp::Directive::OMPD_single: + // 2.8.2 SINGLE construct. 
+ genSingleOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList, endClauseList); + break; + case llvm::omp::Directive::OMPD_target: + // 2.12.5 TARGET construct. + genTargetOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList, outerCombined); + break; + case llvm::omp::Directive::OMPD_target_data: + // 2.12.2 TARGET DATA construct. + genTargetDataOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_task: + // 2.10.1 TASK construct. + genTaskOp(converter, semaCtx, eval, genNested, currentLocation, beginClauseList); - break; - case llvm::omp::Directive::OMPD_target_data: - genTargetDataOp(converter, semaCtx, eval, /*genNested=*/true, - currentLocation, beginClauseList); - break; - case llvm::omp::Directive::OMPD_task: - genTaskOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, - beginClauseList); - break; - case llvm::omp::Directive::OMPD_taskgroup: - genTaskgroupOp(converter, semaCtx, eval, /*genNested=*/true, - currentLocation, beginClauseList); - break; - case llvm::omp::Directive::OMPD_teams: - genTeamsOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, - beginClauseList); - break; - case llvm::omp::Directive::OMPD_workshare: - // FIXME: Workshare is not a commonly used OpenMP construct, an - // implementation for this feature will come later. For the codes - // that use this construct, add a single construct for now. - genSingleOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, - beginClauseList, endClauseList); - break; - default: - singleDirective = false; - break; - } - - if (singleDirective) - return; - - // Codegen for combined directives - bool combinedDirective = false; - if ((llvm::omp::allTargetSet & llvm::omp::blockConstructSet) - .test(directive.v)) { - genTargetOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, - beginClauseList, /*outerCombined=*/true); - combinedDirective = true; - } - if ((llvm::omp::allTeamsSet & llvm::omp::blockConstructSet) - .test(directive.v)) { - genTeamsOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, - beginClauseList); - combinedDirective = true; - } - if ((llvm::omp::allParallelSet & llvm::omp::blockConstructSet) - .test(directive.v)) { - bool outerCombined = - directive.v != llvm::omp::Directive::OMPD_target_parallel; - genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false, - currentLocation, beginClauseList, outerCombined); - combinedDirective = true; - } - if ((llvm::omp::workShareSet & llvm::omp::blockConstructSet) - .test(directive.v)) { - genSingleOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, - beginClauseList, endClauseList); - combinedDirective = true; + break; + case llvm::omp::Directive::OMPD_taskgroup: + // 2.17.6 TASKGROUP construct. + genTaskgroupOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_teams: + // 2.7 TEAMS construct. + // FIXME Pass the outerCombined argument or rename it to better describe + // what it represents if it must always be `false` in this context. + genTeamsOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_workshare: + // 2.8.3 WORKSHARE construct. + // FIXME: Workshare is not a commonly used OpenMP construct, an + // implementation for this feature will come later. For the codes + // that use this construct, add a single construct for now. 
+ genSingleOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList, endClauseList); + break; + default: + llvm_unreachable("Unexpected block construct"); + break; + } + outermostLeafConstruct = false; } - if (!combinedDirective) - TODO(currentLocation, "Unhandled block directive (" + - llvm::omp::getOpenMPDirectiveName(directive.v) + - ")"); - - genNestedEvaluations(converter, eval); } static void @@ -2248,9 +2346,12 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, std::get(beginLoopDirective.t); mlir::Location currentLocation = converter.genLocation(beginLoopDirective.source); - const auto ompDirective = + const auto origDirective = std::get(beginLoopDirective.t).v; + assert(llvm::omp::loopConstructSet.test(origDirective) && + "Expected loop construct"); + const auto *endClauseList = [&]() { using RetTy = const Fortran::parser::OmpClauseList *; if (auto &endLoopDirective = @@ -2262,56 +2363,103 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, return RetTy(); }(); - bool validDirective = false; - if (llvm::omp::topTaskloopSet.test(ompDirective)) { - validDirective = true; - genTaskloopOp(converter, semaCtx, eval, currentLocation, beginClauseList); - } else { - // Create omp.{target, teams, distribute, parallel} nested operations - if ((llvm::omp::allTargetSet & llvm::omp::loopConstructSet) - .test(ompDirective)) { - validDirective = true; - genTargetOp(converter, semaCtx, eval, /*genNested=*/false, - currentLocation, beginClauseList, /*outerCombined=*/true); - } - if ((llvm::omp::allTeamsSet & llvm::omp::loopConstructSet) - .test(ompDirective)) { - validDirective = true; - genTeamsOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, - beginClauseList, /*outerCombined=*/true); - } - if (llvm::omp::allDistributeSet.test(ompDirective)) { - validDirective = true; - genDistributeOp(converter, semaCtx, eval, /*genNested=*/false, - currentLocation, beginClauseList); - } - if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet) - .test(ompDirective)) { - validDirective = true; - genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false, - currentLocation, beginClauseList, /*outerCombined=*/true); + std::optional nextDir = origDirective; + while (nextDir) { + llvm::omp::Directive leafDir; + std::tie(leafDir, nextDir) = splitCombinedDirective(*nextDir); + if (llvm::omp::compositeConstructSet.test(leafDir)) { + assert(!nextDir && "Composite construct cannot be split"); + switch (leafDir) { + case llvm::omp::Directive::OMPD_distribute_parallel_do: + // 2.9.4.3 DISTRIBUTE PARALLEL Worksharing-Loop construct. + genCompositeDistributeParallelDo(converter, semaCtx, eval, + beginClauseList, endClauseList, + currentLocation); + break; + case llvm::omp::Directive::OMPD_distribute_parallel_do_simd: + // 2.9.4.4 DISTRIBUTE PARALLEL Worksharing-Loop SIMD construct. + genCompositeDistributeParallelDoSimd(converter, semaCtx, eval, + beginClauseList, endClauseList, + currentLocation); + break; + case llvm::omp::Directive::OMPD_distribute_simd: + // 2.9.4.2 DISTRIBUTE SIMD construct. + genCompositeDistributeSimd(converter, semaCtx, eval, beginClauseList, + endClauseList, currentLocation); + break; + case llvm::omp::Directive::OMPD_do_simd: + // 2.9.3.2 Worksharing-Loop SIMD construct. + genCompositeDoSimd(converter, semaCtx, eval, beginClauseList, + endClauseList, currentLocation); + break; + case llvm::omp::Directive::OMPD_taskloop_simd: + // 2.10.3 TASKLOOP SIMD construct. 
+ genCompositeTaskloopSimd(converter, semaCtx, eval, beginClauseList, + endClauseList, currentLocation); + break; + default: + llvm_unreachable("Unexpected composite construct"); + } + } else { + const bool genNested = !nextDir; + switch (leafDir) { + case llvm::omp::Directive::OMPD_distribute: + // 2.9.4.1 DISTRIBUTE construct. + genDistributeOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_do: + // 2.9.2 Worksharing-Loop construct. + genWsloopOp(converter, semaCtx, eval, currentLocation, beginClauseList, + endClauseList); + break; + case llvm::omp::Directive::OMPD_parallel: + // 2.6 PARALLEL construct. + // FIXME This is not necessarily always the outer leaf construct of a + // combined construct in this context (e.g. DISTRIBUTE PARALLEL DO). + // Maybe rename the argument if it represents something else or + // initialize it properly. + genParallelOp(converter, symTable, semaCtx, eval, genNested, + currentLocation, beginClauseList, + /*outerCombined=*/true); + break; + case llvm::omp::Directive::OMPD_simd: + // 2.9.3.1 SIMD construct. + genSimdOp(converter, semaCtx, eval, currentLocation, beginClauseList); + break; + case llvm::omp::Directive::OMPD_target: + // 2.12.5 TARGET construct. + genTargetOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList, /*outerCombined=*/true); + break; + case llvm::omp::Directive::OMPD_taskloop: + // 2.10.2 TASKLOOP construct. + genTaskloopOp(converter, semaCtx, eval, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_teams: + // 2.7 TEAMS construct. + // FIXME This is not necessarily always the outer leaf construct of a + // combined construct in this context (e.g. TARGET TEAMS DISTRIBUTE). + // Maybe rename the argument if it represents something else or + // initialize it properly. + genTeamsOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList, /*outerCombined=*/true); + break; + case llvm::omp::Directive::OMPD_loop: + case llvm::omp::Directive::OMPD_masked: + case llvm::omp::Directive::OMPD_master: + case llvm::omp::Directive::OMPD_tile: + case llvm::omp::Directive::OMPD_unroll: + TODO(currentLocation, "Unhandled loop directive (" + + llvm::omp::getOpenMPDirectiveName(leafDir) + + ")"); + break; + default: + llvm_unreachable("Unexpected loop construct"); + } } } - if ((llvm::omp::allDoSet | llvm::omp::allSimdSet).test(ompDirective)) - validDirective = true; - - if (!validDirective) { - TODO(currentLocation, "Unhandled loop directive (" + - llvm::omp::getOpenMPDirectiveName(ompDirective) + - ")"); - } - - if (llvm::omp::allDoSimdSet.test(ompDirective)) { - // 2.9.3.2 Workshare SIMD construct - genCompositeDoSimd(converter, semaCtx, eval, ompDirective, beginClauseList, - endClauseList, currentLocation); - } else if (llvm::omp::allSimdSet.test(ompDirective)) { - // 2.9.3.1 SIMD construct - genSimdOp(converter, semaCtx, eval, currentLocation, beginClauseList); - } else { - genWsloopOp(converter, semaCtx, eval, currentLocation, beginClauseList, - endClauseList); - } } static void From 06eedffe0d2782922e63cc25cb927f4acdaf7b30 Mon Sep 17 00:00:00 2001 From: NagyDonat Date: Wed, 17 Apr 2024 13:26:51 +0200 Subject: [PATCH 240/300] [analyzer] Use explicit call description mode in iterator checkers (#88913) This commit explicitly specifies the matching mode (C library function, any non-method function, or C++ method) for the `CallDescription`s constructed in the iterator/container checkers.
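The shape of the change, on a hypothetical map entry (MyChecker and handleClear are placeholders, not code from this patch; CDM abbreviates the call description matching mode):

```cpp
// Before: no explicit mode, so {"clear"} could in principle match both a
// C++ method and an unrelated free function with the same name.
//   {{{"clear"}, 0}, &MyChecker::handleClear},
// After: the intended kind of callee is stated explicitly.
{{CDM::CXXMethod, {"clear"}, 0}, &MyChecker::handleClear},
```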
This change won't cause major functional changes, but isn't NFC because it ensures that e.g. call descriptions for a non-method function won't accidentally match a method that has the same name. Separate commits will perform (or have already performed) this change in other checkers. My goal is to ensure that the call description mode is always explicitly specified and eliminate (or strongly restrict) the vague "may be either a method or a simple function" mode that's the current default. I'm handling the iterator checkers in this separate commit because they're infamously complex; but I don't expect any trouble because this transition doesn't interact with the "central" logic of iterator handling. --- .../Checkers/ContainerModeling.cpp | 33 +++++---- .../Checkers/DebugContainerModeling.cpp | 4 +- .../Checkers/DebugIteratorModeling.cpp | 6 +- .../Checkers/IteratorModeling.cpp | 7 +- .../Checkers/IteratorRangeChecker.cpp | 11 ++- .../Checkers/STLAlgorithmModeling.cpp | 67 ++++++++++++------- 6 files changed, 80 insertions(+), 48 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/ContainerModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/ContainerModeling.cpp index 009c0d3fb93686..55ed809bfed6ce 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ContainerModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ContainerModeling.cpp @@ -72,26 +72,31 @@ class ContainerModeling SVal) const; CallDescriptionMap NoIterParamFunctions = { - {{{"clear"}, 0}, &ContainerModeling::handleClear}, - {{{"assign"}, 2}, &ContainerModeling::handleAssign}, - {{{"push_back"}, 1}, &ContainerModeling::handlePushBack}, - {{{"emplace_back"}, 1}, &ContainerModeling::handlePushBack}, - {{{"pop_back"}, 0}, &ContainerModeling::handlePopBack}, - {{{"push_front"}, 1}, &ContainerModeling::handlePushFront}, - {{{"emplace_front"}, 1}, &ContainerModeling::handlePushFront}, - {{{"pop_front"}, 0}, &ContainerModeling::handlePopFront}, + {{CDM::CXXMethod, {"clear"}, 0}, &ContainerModeling::handleClear}, + {{CDM::CXXMethod, {"assign"}, 2}, &ContainerModeling::handleAssign}, + {{CDM::CXXMethod, {"push_back"}, 1}, &ContainerModeling::handlePushBack}, + {{CDM::CXXMethod, {"emplace_back"}, 1}, + &ContainerModeling::handlePushBack}, + {{CDM::CXXMethod, {"pop_back"}, 0}, &ContainerModeling::handlePopBack}, + {{CDM::CXXMethod, {"push_front"}, 1}, + &ContainerModeling::handlePushFront}, + {{CDM::CXXMethod, {"emplace_front"}, 1}, + &ContainerModeling::handlePushFront}, + {{CDM::CXXMethod, {"pop_front"}, 0}, &ContainerModeling::handlePopFront}, }; CallDescriptionMap OneIterParamFunctions = { - {{{"insert"}, 2}, &ContainerModeling::handleInsert}, - {{{"emplace"}, 2}, &ContainerModeling::handleInsert}, - {{{"erase"}, 1}, &ContainerModeling::handleErase}, - {{{"erase_after"}, 1}, &ContainerModeling::handleEraseAfter}, + {{CDM::CXXMethod, {"insert"}, 2}, &ContainerModeling::handleInsert}, + {{CDM::CXXMethod, {"emplace"}, 2}, &ContainerModeling::handleInsert}, + {{CDM::CXXMethod, {"erase"}, 1}, &ContainerModeling::handleErase}, + {{CDM::CXXMethod, {"erase_after"}, 1}, + &ContainerModeling::handleEraseAfter}, }; CallDescriptionMap TwoIterParamFunctions = { - {{{"erase"}, 2}, &ContainerModeling::handleErase}, - {{{"erase_after"}, 2}, &ContainerModeling::handleEraseAfter}, + {{CDM::CXXMethod, {"erase"}, 2}, &ContainerModeling::handleErase}, + {{CDM::CXXMethod, {"erase_after"}, 2}, + &ContainerModeling::handleEraseAfter}, }; }; diff --git a/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp 
b/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp index 72186a99d94358..d3830a01dd0cbd 100644 --- a/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp @@ -42,9 +42,9 @@ class DebugContainerModeling CheckerContext &) const; CallDescriptionMap Callbacks = { - {{{"clang_analyzer_container_begin"}, 1}, + {{CDM::SimpleFunc, {"clang_analyzer_container_begin"}, 1}, &DebugContainerModeling::analyzerContainerBegin}, - {{{"clang_analyzer_container_end"}, 1}, + {{CDM::SimpleFunc, {"clang_analyzer_container_end"}, 1}, &DebugContainerModeling::analyzerContainerEnd}, }; diff --git a/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp index 79ab71d7829db7..203743dacda636 100644 --- a/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp @@ -43,11 +43,11 @@ class DebugIteratorModeling CheckerContext &) const; CallDescriptionMap Callbacks = { - {{{"clang_analyzer_iterator_position"}, 1}, + {{CDM::SimpleFunc, {"clang_analyzer_iterator_position"}, 1}, &DebugIteratorModeling::analyzerIteratorPosition}, - {{{"clang_analyzer_iterator_container"}, 1}, + {{CDM::SimpleFunc, {"clang_analyzer_iterator_container"}, 1}, &DebugIteratorModeling::analyzerIteratorContainer}, - {{{"clang_analyzer_iterator_validity"}, 1}, + {{CDM::SimpleFunc, {"clang_analyzer_iterator_validity"}, 1}, &DebugIteratorModeling::analyzerIteratorValidity}, }; diff --git a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp index a95e811c2a4181..5649454b4cd47e 100644 --- a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp @@ -129,19 +129,20 @@ class IteratorModeling CallDescriptionMap AdvanceLikeFunctions = { // template // void advance(InputIt& it, Distance n); - {{{"std", "advance"}, 2}, &IteratorModeling::handleAdvance}, + {{CDM::SimpleFunc, {"std", "advance"}, 2}, + &IteratorModeling::handleAdvance}, // template // BidirIt prev( // BidirIt it, // typename std::iterator_traits::difference_type n = 1); - {{{"std", "prev"}, 2}, &IteratorModeling::handlePrev}, + {{CDM::SimpleFunc, {"std", "prev"}, 2}, &IteratorModeling::handlePrev}, // template // ForwardIt next( // ForwardIt it, // typename std::iterator_traits::difference_type n = 1); - {{{"std", "next"}, 2}, &IteratorModeling::handleNext}, + {{CDM::SimpleFunc, {"std", "next"}, 2}, &IteratorModeling::handleNext}, }; public: diff --git a/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp index d2b61fb92483c3..4dd2f700a2a0eb 100644 --- a/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp @@ -56,10 +56,15 @@ class IteratorRangeChecker using AdvanceFn = void (IteratorRangeChecker::*)(CheckerContext &, SVal, SVal) const; + // FIXME: these three functions are also listed in IteratorModeling.cpp, + // perhaps unify their handling? 
CallDescriptionMap AdvanceFunctions = { - {{{"std", "advance"}, 2}, &IteratorRangeChecker::verifyAdvance}, - {{{"std", "prev"}, 2}, &IteratorRangeChecker::verifyPrev}, - {{{"std", "next"}, 2}, &IteratorRangeChecker::verifyNext}, + {{CDM::SimpleFunc, {"std", "advance"}, 2}, + &IteratorRangeChecker::verifyAdvance}, + {{CDM::SimpleFunc, {"std", "prev"}, 2}, + &IteratorRangeChecker::verifyPrev}, + {{CDM::SimpleFunc, {"std", "next"}, 2}, + &IteratorRangeChecker::verifyNext}, }; }; diff --git a/clang/lib/StaticAnalyzer/Checkers/STLAlgorithmModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/STLAlgorithmModeling.cpp index a5173a05636a09..e037719b902986 100644 --- a/clang/lib/StaticAnalyzer/Checkers/STLAlgorithmModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/STLAlgorithmModeling.cpp @@ -33,29 +33,50 @@ class STLAlgorithmModeling : public Checker { const CallExpr *) const; const CallDescriptionMap Callbacks = { - {{{"std", "find"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_if"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_if"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_if_not"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_if_not"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_first_of"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_first_of"}, 5}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_first_of"}, 6}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_end"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_end"}, 5}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_end"}, 6}, &STLAlgorithmModeling::evalFind}, - {{{"std", "lower_bound"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "lower_bound"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "upper_bound"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "upper_bound"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search"}, 5}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search"}, 6}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search_n"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search_n"}, 5}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search_n"}, 6}, &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find"}, 3}, &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find"}, 4}, &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_if"}, 3}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_if"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_if_not"}, 3}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_if_not"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_first_of"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_first_of"}, 5}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_first_of"}, 6}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_end"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_end"}, 5}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_end"}, 6}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "lower_bound"}, 3}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "lower_bound"}, 4}, + &STLAlgorithmModeling::evalFind}, + 
{{CDM::SimpleFunc, {"std", "upper_bound"}, 3}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "upper_bound"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search"}, 3}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search"}, 5}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search"}, 6}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search_n"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search_n"}, 5}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search_n"}, 6}, + &STLAlgorithmModeling::evalFind}, }; public: From 631c5e818ef8bb0f61fd3bb44cc4449be2142e2b Mon Sep 17 00:00:00 2001 From: MbjYjbpivj Date: Wed, 17 Apr 2024 20:15:30 +0800 Subject: [PATCH 241/300] [mlir] fix intNEQValue summary (#89029) Fix the summary of intNEQValue. --- mlir/include/mlir/IR/CommonAttrConstraints.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/IR/CommonAttrConstraints.td b/mlir/include/mlir/IR/CommonAttrConstraints.td index 0312ac7ec1d8df..0d69bb0717a599 100644 --- a/mlir/include/mlir/IR/CommonAttrConstraints.td +++ b/mlir/include/mlir/IR/CommonAttrConstraints.td @@ -755,7 +755,7 @@ class AllAttrOf constraints> : AttrConstraint< class IntNEQValue : AttrConstraint< CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getInt() != " # n>, - "whose minimum value is " # n>; + "whose value is not " # n>; class IntMinValue : AttrConstraint< CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getInt() >= " # n>, From d57907d0b4f292f148310695ed011fe5a0585d6b Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 17 Apr 2024 08:21:48 -0400 Subject: [PATCH 242/300] [libc++] Add missing iterator requirement checks in the PSTL (#88127) Also add tests for those, and add a few missing requirements to testing iterators in the test suite. 
--- .../__algorithm/pstl_any_all_none_of.h | 6 +- libcxx/include/__algorithm/pstl_copy.h | 13 ++ libcxx/include/__algorithm/pstl_count.h | 5 + libcxx/include/__algorithm/pstl_equal.h | 9 + libcxx/include/__algorithm/pstl_fill.h | 6 +- libcxx/include/__algorithm/pstl_find.h | 6 +- libcxx/include/__algorithm/pstl_for_each.h | 4 +- libcxx/include/__algorithm/pstl_generate.h | 5 +- .../include/__algorithm/pstl_is_partitioned.h | 2 + libcxx/include/__algorithm/pstl_merge.h | 5 + libcxx/include/__algorithm/pstl_move.h | 5 + libcxx/include/__algorithm/pstl_replace.h | 13 ++ libcxx/include/__algorithm/pstl_rotate_copy.h | 5 + libcxx/include/__algorithm/pstl_sort.h | 3 + libcxx/include/__algorithm/pstl_stable_sort.h | 2 + libcxx/include/__algorithm/pstl_transform.h | 16 +- .../__iterator/cpp17_iterator_concepts.h | 38 ++-- libcxx/include/__numeric/pstl_reduce.h | 3 + .../include/__numeric/pstl_transform_reduce.h | 6 + .../cpp17_iterator_concepts.verify.cpp | 125 ++++++------ .../pstl.iterator-requirements.verify.cpp | 192 ++++++++++++++++++ libcxx/test/support/test_iterators.h | 16 +- 22 files changed, 383 insertions(+), 102 deletions(-) create mode 100644 libcxx/test/libcxx/algorithms/pstl.iterator-requirements.verify.cpp diff --git a/libcxx/include/__algorithm/pstl_any_all_none_of.h b/libcxx/include/__algorithm/pstl_any_all_none_of.h index 4b1e0e61b54218..911a7e42b3fa3f 100644 --- a/libcxx/include/__algorithm/pstl_any_all_none_of.h +++ b/libcxx/include/__algorithm/pstl_any_all_none_of.h @@ -60,7 +60,7 @@ template , int> = 0> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool any_of(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "any_of requires a ForwardIterator"); auto __res = std::__any_of(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -99,7 +99,7 @@ template , int> = 0> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool all_of(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Pred __pred) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "all_of requires a ForwardIterator"); auto __res = std::__all_of(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -136,7 +136,7 @@ template , int> = 0> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool none_of(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Pred __pred) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "none_of requires a ForwardIterator"); auto __res = std::__none_of(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_copy.h b/libcxx/include/__algorithm/pstl_copy.h index 1069dcec0e117a..f35bb9713ef140 100644 --- a/libcxx/include/__algorithm/pstl_copy.h +++ b/libcxx/include/__algorithm/pstl_copy.h @@ -16,6 +16,7 @@ #include <__config> #include <__functional/identity.h> #include <__iterator/concepts.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_execution_policy.h> @@ -67,6 +68,12 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI 
_ForwardOutIterator copy(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _ForwardOutIterator __result) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardIterator, "copy(first, last, result) requires [first, last) to be ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardOutIterator, "copy(first, last, result) requires result to be a ForwardIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(*__first), "copy(first, last, result) requires result to be an OutputIterator"); auto __res = std::__copy(__policy, std::move(__first), std::move(__last), std::move(__result)); if (!__res) std::__throw_bad_alloc(); @@ -106,6 +113,12 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator copy_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _Size __n, _ForwardOutIterator __result) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardIterator, "copy_n(first, n, result) requires first to be a ForwardIterator"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardOutIterator, "copy_n(first, n, result) requires result to be a ForwardIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(*__first), "copy_n(first, n, result) requires result to be an OutputIterator"); auto __res = std::__copy_n(__policy, std::move(__first), std::move(__n), std::move(__result)); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_count.h b/libcxx/include/__algorithm/pstl_count.h index 2781f6bfd3c9e0..6ff57cac334eb0 100644 --- a/libcxx/include/__algorithm/pstl_count.h +++ b/libcxx/include/__algorithm/pstl_count.h @@ -17,6 +17,7 @@ #include <__atomic/atomic.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> #include <__numeric/pstl_transform_reduce.h> #include <__type_traits/enable_if.h> @@ -70,6 +71,8 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI __iter_diff_t<_ForwardIterator> count_if(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardIterator, "count_if(first, last, pred) requires [first, last) to be ForwardIterators"); auto __res = std::__count_if(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -106,6 +109,8 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI __iter_diff_t<_ForwardIterator> count(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardIterator, "count(first, last, val) requires [first, last) to be ForwardIterators"); auto __res = std::__count(__policy, std::move(__first), std::move(__last), __value); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_equal.h b/libcxx/include/__algorithm/pstl_equal.h index d235c0f4f41972..0b38197d7f63df 100644 --- a/libcxx/include/__algorithm/pstl_equal.h +++ b/libcxx/include/__algorithm/pstl_equal.h @@ -13,6 +13,7 @@ #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> #include <__numeric/pstl_transform_reduce.h> #include <__utility/move.h> @@ -74,6 +75,8 @@ equal(_ExecutionPolicy&& __policy, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _Pred __pred) { + 
_LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators"); auto __res = std::__equal(__policy, std::move(__first1), std::move(__last1), std::move(__first2), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -86,6 +89,8 @@ template >, int> = 0> _LIBCPP_HIDE_FROM_ABI bool equal(_ExecutionPolicy&& __policy, _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators"); return std::equal(__policy, std::move(__first1), std::move(__last1), std::move(__first2), std::equal_to{}); } @@ -145,6 +150,8 @@ equal(_ExecutionPolicy&& __policy, _ForwardIterator2 __first2, _ForwardIterator2 __last2, _Pred __pred) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators"); auto __res = std::__equal( __policy, std::move(__first1), std::move(__last1), std::move(__first2), std::move(__last2), std::move(__pred)); if (!__res) @@ -162,6 +169,8 @@ equal(_ExecutionPolicy&& __policy, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators"); return std::equal( __policy, std::move(__first1), std::move(__last1), std::move(__first2), std::move(__last2), std::equal_to{}); } diff --git a/libcxx/include/__algorithm/pstl_fill.h b/libcxx/include/__algorithm/pstl_fill.h index 488b49a0feec96..fd248506bc4b96 100644 --- a/libcxx/include/__algorithm/pstl_fill.h +++ b/libcxx/include/__algorithm/pstl_fill.h @@ -43,7 +43,6 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI optional<__empty> __fill(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) noexcept { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); return std::__pstl_frontend_dispatch( _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_fill, _RawPolicy), [&](_ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_value) { @@ -63,7 +62,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void fill(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "fill requires ForwardIterators"); if (!std::__fill(__policy, std::move(__first), std::move(__last), __value)) std::__throw_bad_alloc(); } @@ -79,7 +78,6 @@ template , int> = 0> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty> __fill_n(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _SizeT&& __n, const _Tp& __value) noexcept { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); return std::__pstl_frontend_dispatch( _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_fill_n, _RawPolicy), [&](_ForwardIterator __g_first, _SizeT __g_n, const _Tp& __g_value) { @@ -102,7 +100,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void fill_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _SizeT __n, const _Tp& __value) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + 
_LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "fill_n requires ForwardIterators"); if (!std::__fill_n(__policy, std::move(__first), std::move(__n), __value)) std::__throw_bad_alloc(); } diff --git a/libcxx/include/__algorithm/pstl_find.h b/libcxx/include/__algorithm/pstl_find.h index 5b694db68aead4..3b30a7bc9b456f 100644 --- a/libcxx/include/__algorithm/pstl_find.h +++ b/libcxx/include/__algorithm/pstl_find.h @@ -50,7 +50,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI _ForwardIterator find_if(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "find_if requires ForwardIterators"); auto __res = std::__find_if(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -88,7 +88,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI _ForwardIterator find_if_not(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "find_if_not requires ForwardIterators"); auto __res = std::__find_if_not(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -125,7 +125,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI _ForwardIterator find(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "find requires ForwardIterators"); auto __res = std::__find(__policy, std::move(__first), std::move(__last), __value); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_for_each.h b/libcxx/include/__algorithm/pstl_for_each.h index bb7b5a61a6dc0d..a9ebed74a62fd4 100644 --- a/libcxx/include/__algorithm/pstl_for_each.h +++ b/libcxx/include/__algorithm/pstl_for_each.h @@ -53,7 +53,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void for_each(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Function __func) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "for_each requires ForwardIterators"); if (!std::__for_each(__policy, std::move(__first), std::move(__last), std::move(__func))) std::__throw_bad_alloc(); } @@ -93,7 +93,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void for_each_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _Size __size, _Function __func) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "for_each_n requires a ForwardIterator"); auto __res = std::__for_each_n(__policy, std::move(__first), std::move(__size), std::move(__func)); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_generate.h b/libcxx/include/__algorithm/pstl_generate.h index 7133c6f4f4c621..886af290d7f25a 100644 --- a/libcxx/include/__algorithm/pstl_generate.h +++ b/libcxx/include/__algorithm/pstl_generate.h @@ -42,7 +42,6 @@ template , int> = 0> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty> __generate(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Generator&& __gen) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); return 
std::__pstl_frontend_dispatch( _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_generate, _RawPolicy), [&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last, _Generator __g_gen) { @@ -63,7 +62,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void generate(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Generator __gen) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "generate requires ForwardIterators"); if (!std::__generate(__policy, std::move(__first), std::move(__last), std::move(__gen))) std::__throw_bad_alloc(); } @@ -100,7 +99,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void generate_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _Size __n, _Generator __gen) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "generate_n requires a ForwardIterator"); if (!std::__generate_n(__policy, std::move(__first), std::move(__n), std::move(__gen))) std::__throw_bad_alloc(); } diff --git a/libcxx/include/__algorithm/pstl_is_partitioned.h b/libcxx/include/__algorithm/pstl_is_partitioned.h index b6543021220727..108bb1e4325260 100644 --- a/libcxx/include/__algorithm/pstl_is_partitioned.h +++ b/libcxx/include/__algorithm/pstl_is_partitioned.h @@ -14,6 +14,7 @@ #include <__algorithm/pstl_find.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> +#include <__iterator/cpp17_iterator_concepts.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -62,6 +63,7 @@ template , int> = 0> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool is_partitioned(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "is_partitioned requires ForwardIterators"); auto __res = std::__is_partitioned(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_merge.h b/libcxx/include/__algorithm/pstl_merge.h index 3d262db6bc0c15..d03cd8c7fbd580 100644 --- a/libcxx/include/__algorithm/pstl_merge.h +++ b/libcxx/include/__algorithm/pstl_merge.h @@ -12,6 +12,7 @@ #include <__algorithm/pstl_backend.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -70,6 +71,10 @@ merge(_ExecutionPolicy&& __policy, _ForwardIterator2 __last2, _ForwardOutIterator __result, _Comp __comp = {}) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "merge requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "merge requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, decltype(*__first1), "merge requires an OutputIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, decltype(*__first2), "merge requires an OutputIterator"); auto __res = std::__merge( __policy, std::move(__first1), diff --git a/libcxx/include/__algorithm/pstl_move.h b/libcxx/include/__algorithm/pstl_move.h index d8441f1a6c2e16..f4c8c1fbb2e876 100644 --- a/libcxx/include/__algorithm/pstl_move.h +++ b/libcxx/include/__algorithm/pstl_move.h @@ -15,6 +15,7 @@ #include <__algorithm/pstl_transform.h> #include <__config> #include 
<__functional/identity.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_constant_evaluated.h> @@ -69,6 +70,10 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator move(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _ForwardOutIterator __result) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "move requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "move requires an OutputIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(std::move(*__first)), "move requires an OutputIterator"); auto __res = std::__move(__policy, std::move(__first), std::move(__last), std::move(__result)); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_replace.h b/libcxx/include/__algorithm/pstl_replace.h index b1caf3fd4ac0a1..73ac11cda26a9f 100644 --- a/libcxx/include/__algorithm/pstl_replace.h +++ b/libcxx/include/__algorithm/pstl_replace.h @@ -14,6 +14,7 @@ #include <__algorithm/pstl_frontend_dispatch.h> #include <__algorithm/pstl_transform.h> #include <__config> +#include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> #include <__type_traits/enable_if.h> #include <__type_traits/remove_cvref.h> @@ -74,6 +75,7 @@ replace_if(_ExecutionPolicy&& __policy, _ForwardIterator __last, _Pred __pred, const _Tp& __new_value) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "replace_if requires ForwardIterators"); auto __res = std::__replace_if(__policy, std::move(__first), std::move(__last), std::move(__pred), __new_value); if (!__res) std::__throw_bad_alloc(); @@ -121,6 +123,7 @@ replace(_ExecutionPolicy&& __policy, _ForwardIterator __last, const _Tp& __old_value, const _Tp& __new_value) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "replace requires ForwardIterators"); if (!std::__replace(__policy, std::move(__first), std::move(__last), __old_value, __new_value)) std::__throw_bad_alloc(); } @@ -177,6 +180,11 @@ _LIBCPP_HIDE_FROM_ABI void replace_copy_if( _ForwardOutIterator __result, _Pred __pred, const _Tp& __new_value) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "replace_copy_if requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "replace_copy_if requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(*__first), "replace_copy_if requires an OutputIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, const _Tp&, "replace_copy requires an OutputIterator"); if (!std::__replace_copy_if( __policy, std::move(__first), std::move(__last), std::move(__result), std::move(__pred), __new_value)) std::__throw_bad_alloc(); @@ -233,6 +241,11 @@ _LIBCPP_HIDE_FROM_ABI void replace_copy( _ForwardOutIterator __result, const _Tp& __old_value, const _Tp& __new_value) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "replace_copy requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "replace_copy requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(*__first), "replace_copy requires an OutputIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, const _Tp&, "replace_copy requires an OutputIterator"); if (!std::__replace_copy( __policy, std::move(__first), std::move(__last), 
std::move(__result), __old_value, __new_value)) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_rotate_copy.h b/libcxx/include/__algorithm/pstl_rotate_copy.h index 346aab1d4a55c0..adab3958fe3112 100644 --- a/libcxx/include/__algorithm/pstl_rotate_copy.h +++ b/libcxx/include/__algorithm/pstl_rotate_copy.h @@ -12,6 +12,7 @@ #include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_copy.h> #include <__algorithm/pstl_frontend_dispatch.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__type_traits/is_execution_policy.h> #include @@ -69,6 +70,10 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator rotate_copy( _ForwardIterator __middle, _ForwardIterator __last, _ForwardOutIterator __result) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "rotate_copy requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "rotate_copy requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(*__first), "rotate_copy requires an OutputIterator"); auto __res = std::__rotate_copy(__policy, std::move(__first), std::move(__middle), std::move(__last), std::move(__result)); if (!__res) diff --git a/libcxx/include/__algorithm/pstl_sort.h b/libcxx/include/__algorithm/pstl_sort.h index a931f768111a23..65bc794ca6f4c8 100644 --- a/libcxx/include/__algorithm/pstl_sort.h +++ b/libcxx/include/__algorithm/pstl_sort.h @@ -14,6 +14,7 @@ #include <__algorithm/pstl_stable_sort.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> #include <__utility/empty.h> @@ -60,6 +61,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void sort(_ExecutionPolicy&& __policy, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) { + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(_RandomAccessIterator, "sort requires RandomAccessIterators"); if (!std::__sort(__policy, std::move(__first), std::move(__last), std::move(__comp))) std::__throw_bad_alloc(); } @@ -70,6 +72,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void sort(_ExecutionPolicy&& __policy, _RandomAccessIterator __first, _RandomAccessIterator __last) { + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(_RandomAccessIterator, "sort requires RandomAccessIterators"); std::sort(std::forward<_ExecutionPolicy>(__policy), std::move(__first), std::move(__last), less{}); } diff --git a/libcxx/include/__algorithm/pstl_stable_sort.h b/libcxx/include/__algorithm/pstl_stable_sort.h index 8ea0bb3f9a8d59..79b94557e3dc3a 100644 --- a/libcxx/include/__algorithm/pstl_stable_sort.h +++ b/libcxx/include/__algorithm/pstl_stable_sort.h @@ -12,6 +12,7 @@ #include <__algorithm/pstl_backend.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -48,6 +49,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void stable_sort( _ExecutionPolicy&& __policy, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp = {}) { + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(_RandomAccessIterator, "stable_sort requires RandomAccessIterators"); if (!std::__stable_sort(__policy, std::move(__first), std::move(__last), std::move(__comp))) std::__throw_bad_alloc(); } diff --git a/libcxx/include/__algorithm/pstl_transform.h 
b/libcxx/include/__algorithm/pstl_transform.h
index f95938782fc3bd..a01a64a43cf1a3 100644
--- a/libcxx/include/__algorithm/pstl_transform.h
+++ b/libcxx/include/__algorithm/pstl_transform.h
@@ -58,9 +58,10 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator transform(
     _ForwardIterator __last,
     _ForwardOutIterator __result,
     _UnaryOperation __op) {
-  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator);
-  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator);
-  _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, decltype(__op(*__first)));
+  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "transform requires ForwardIterators");
+  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "transform requires an OutputIterator");
+  _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(
+      _ForwardOutIterator, decltype(__op(*__first)), "transform requires an OutputIterator");
   auto __res = std::__transform(__policy, std::move(__first), std::move(__last), std::move(__result), std::move(__op));
   if (!__res)
     std::__throw_bad_alloc();
@@ -100,10 +101,11 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator transform(
     _ForwardIterator2 __first2,
     _ForwardOutIterator __result,
     _BinaryOperation __op) {
-  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1);
-  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2);
-  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator);
-  _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, decltype(__op(*__first1, *__first2)));
+  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "transform requires ForwardIterators");
+  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "transform requires ForwardIterators");
+  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "transform requires an OutputIterator");
+  _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(
+      _ForwardOutIterator, decltype(__op(*__first1, *__first2)), "transform requires an OutputIterator");
   auto __res = std::__transform(
       __policy, std::move(__first1), std::move(__last1), std::move(__first2), std::move(__result), std::move(__op));
   if (!__res)
diff --git a/libcxx/include/__iterator/cpp17_iterator_concepts.h b/libcxx/include/__iterator/cpp17_iterator_concepts.h
index cdb561e68452af..9d5a392582da42 100644
--- a/libcxx/include/__iterator/cpp17_iterator_concepts.h
+++ b/libcxx/include/__iterator/cpp17_iterator_concepts.h
@@ -157,29 +157,31 @@ concept __cpp17_random_access_iterator =
 _LIBCPP_END_NAMESPACE_STD

 #  ifndef _LIBCPP_DISABLE_ITERATOR_CHECKS
-#    define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t) static_assert(::std::__cpp17_input_iterator<iter_t>);
-#    define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t) \
-       static_assert(::std::__cpp17_output_iterator<iter_t, write_t>);
-#    define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t) static_assert(::std::__cpp17_forward_iterator<iter_t>);
-#    define _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t) \
-       static_assert(::std::__cpp17_bidirectional_iterator<iter_t>);
-#    define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t) \
-       static_assert(::std::__cpp17_random_access_iterator<iter_t>);
+#    define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t, message) \
+      static_assert(::std::__cpp17_input_iterator<iter_t>, message)
+#    define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t, message) \
+      static_assert(::std::__cpp17_output_iterator<iter_t, write_t>, message)
+#    define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t, message) \
+      static_assert(::std::__cpp17_forward_iterator<iter_t>, message)
+#    define _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t, message) \
+      static_assert(::std::__cpp17_bidirectional_iterator<iter_t>, message)
+#    define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t, message) \
+      static_assert(::std::__cpp17_random_access_iterator<iter_t>, message)
 #  else
-#    define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t)
-#    define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t)
-#    define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t)
-#    define _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t)
-#    define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t)
+#    define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t, message) static_assert(true)
+#    define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t, message) static_assert(true)
+#    define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t, message) static_assert(true)
+#    define _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t, message) static_assert(true)
+#    define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t, message) static_assert(true)
 #  endif
 #else // _LIBCPP_STD_VER >= 20
-#  define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t)
-#  define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t)
-#  define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t)
-#  define _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t)
-#  define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t)
+#  define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t, message) static_assert(true)
+#  define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t, message) static_assert(true)
+#  define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t, message) static_assert(true)
+#  define _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t, message) static_assert(true)
+#  define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t, message) static_assert(true)
 #endif // _LIBCPP_STD_VER >= 20
diff --git a/libcxx/include/__numeric/pstl_reduce.h b/libcxx/include/__numeric/pstl_reduce.h
index f9f666c2bb38b8..d678b9480070b0 100644
--- a/libcxx/include/__numeric/pstl_reduce.h
+++ b/libcxx/include/__numeric/pstl_reduce.h
@@ -12,6 +12,7 @@
 #include <__algorithm/pstl_frontend_dispatch.h>
 #include <__config>
 #include <__functional/identity.h>
+#include <__iterator/cpp17_iterator_concepts.h>
 #include <__iterator/iterator_traits.h>
 #include <__numeric/pstl_transform_reduce.h>
 #include <__type_traits/is_execution_policy.h>
@@ -66,6 +67,7 @@ reduce(_ExecutionPolicy&& __policy,
        _ForwardIterator __last,
        _Tp __init,
        _BinaryOperation __op = {}) {
+  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "reduce requires ForwardIterators");
   auto __res = std::__reduce(__policy, std::move(__first), std::move(__last), std::move(__init), std::move(__op));
   if (!__res)
     std::__throw_bad_alloc();
@@ -94,6 +96,7 @@ template , int> = 0>
 _LIBCPP_HIDE_FROM_ABI __iter_value_type<_ForwardIterator>
 reduce(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last) {
+  _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "reduce requires ForwardIterators");
   auto __res = std::__reduce(__policy, std::move(__first), std::move(__last));
   if (!__res)
     std::__throw_bad_alloc();
diff --git a/libcxx/include/__numeric/pstl_transform_reduce.h b/libcxx/include/__numeric/pstl_transform_reduce.h
index 07ecf0d9956bb0..2d2621dc8dadb1 100644
--- a/libcxx/include/__numeric/pstl_transform_reduce.h
+++ b/libcxx/include/__numeric/pstl_transform_reduce.h
@@ -13,6 +13,7 @@
 #include <__algorithm/pstl_frontend_dispatch.h>
 #include <__config>
 #include <__functional/operations.h>
+#include <__iterator/cpp17_iterator_concepts.h>
 #include <__numeric/transform_reduce.h>
#include <__type_traits/is_execution_policy.h> #include <__utility/move.h> @@ -72,6 +73,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp transform_reduce( _Tp __init, _BinaryOperation1 __reduce, _BinaryOperation2 __transform) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "transform_reduce requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "transform_reduce requires ForwardIterators"); auto __res = std::__transform_reduce( __policy, std::move(__first1), @@ -99,6 +102,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp transform_reduce( _ForwardIterator1 __last1, _ForwardIterator2 __first2, _Tp __init) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "transform_reduce requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "transform_reduce requires ForwardIterators"); return std::transform_reduce(__policy, __first1, __last1, __first2, __init, plus{}, multiplies{}); } @@ -140,6 +145,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp transform_reduce( _Tp __init, _BinaryOperation __reduce, _UnaryOperation __transform) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "transform_reduce requires ForwardIterators"); auto __res = std::__transform_reduce( __policy, std::move(__first), std::move(__last), std::move(__init), std::move(__reduce), std::move(__transform)); if (!__res) diff --git a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp index 344543d5f19ffe..544a9744b7909a 100644 --- a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp +++ b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp @@ -16,29 +16,29 @@ #include struct missing_deref { - using difference_type = std::ptrdiff_t; + using difference_type = std::ptrdiff_t; using iterator_category = std::input_iterator_tag; - using value_type = int; - using reference = int&; + using value_type = int; + using reference = int&; missing_deref& operator++(); }; struct missing_preincrement { - using difference_type = std::ptrdiff_t; + using difference_type = std::ptrdiff_t; using iterator_category = std::input_iterator_tag; - using value_type = int; - using reference = int&; + using value_type = int; + using reference = int&; int& operator*(); }; template struct valid_iterator { - using difference_type = std::ptrdiff_t; + using difference_type = std::ptrdiff_t; using iterator_category = std::input_iterator_tag; - using value_type = int; - using reference = int&; + using value_type = int; + using reference = int&; int& operator*() const; Derived& operator++(); @@ -51,30 +51,30 @@ struct valid_iterator { }; struct not_move_constructible : valid_iterator { - not_move_constructible(const not_move_constructible&) = default; - not_move_constructible(not_move_constructible&&) = delete; - not_move_constructible& operator=(not_move_constructible&&) = default; + not_move_constructible(const not_move_constructible&) = default; + not_move_constructible(not_move_constructible&&) = delete; + not_move_constructible& operator=(not_move_constructible&&) = default; not_move_constructible& operator=(const not_move_constructible&) = default; }; struct not_copy_constructible : valid_iterator { - not_copy_constructible(const not_copy_constructible&) = delete; - not_copy_constructible(not_copy_constructible&&) = default; - not_copy_constructible& operator=(not_copy_constructible&&) = default; + not_copy_constructible(const not_copy_constructible&) = delete; + not_copy_constructible(not_copy_constructible&&) = 
default; + not_copy_constructible& operator=(not_copy_constructible&&) = default; not_copy_constructible& operator=(const not_copy_constructible&) = default; }; struct not_move_assignable : valid_iterator { - not_move_assignable(const not_move_assignable&) = default; - not_move_assignable(not_move_assignable&&) = default; - not_move_assignable& operator=(not_move_assignable&&) = delete; + not_move_assignable(const not_move_assignable&) = default; + not_move_assignable(not_move_assignable&&) = default; + not_move_assignable& operator=(not_move_assignable&&) = delete; not_move_assignable& operator=(const not_move_assignable&) = default; }; struct not_copy_assignable : valid_iterator { - not_copy_assignable(const not_copy_assignable&) = default; - not_copy_assignable(not_copy_assignable&&) = default; - not_copy_assignable& operator=(not_copy_assignable&&) = default; + not_copy_assignable(const not_copy_assignable&) = default; + not_copy_assignable(not_copy_assignable&&) = default; + not_copy_assignable& operator=(not_copy_assignable&&) = default; not_copy_assignable& operator=(const not_copy_assignable&) = delete; }; @@ -89,7 +89,6 @@ void check_iterator_requirements() { static_assert(std::__cpp17_iterator); // expected-error {{static assertion failed}} // expected-note@*:* {{cannot increment value of type 'missing_preincrement'}} - static_assert(std::__cpp17_iterator); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'not_move_constructible' does not satisfy '__cpp17_move_constructible'}} @@ -115,11 +114,13 @@ bool operator==(not_unequality_comparable, not_unequality_comparable); bool operator!=(not_unequality_comparable, not_unequality_comparable) = delete; void check_input_iterator_requirements() { - _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_equality_comparable); // expected-error {{static assertion failed}} + // clang-format off + _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_equality_comparable, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{'__lhs == __rhs' would be invalid: overload resolution selected deleted operator '=='}} - _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_unequality_comparable); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_unequality_comparable, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{'__lhs != __rhs' would be invalid: overload resolution selected deleted operator '!='}} + // clang-format on } template @@ -138,9 +139,9 @@ struct postincrement_not_ref : valid_iterator {}; bool operator==(postincrement_not_ref, postincrement_not_ref); void check_forward_iterator_requirements() { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(not_default_constructible); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(not_default_constructible, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'not_default_constructible' does not satisfy '__cpp17_default_constructible'}} - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(postincrement_not_ref); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(postincrement_not_ref, ""); // expected-error {{static assertion failed}} #ifndef _AIX // expected-note@*:* {{because type constraint 'convertible_to::Proxy, const postincrement_not_ref &>' was not satisfied}} #endif @@ -155,7 +156,6 @@ struct missing_postdecrement : valid_forward_iterator { }; struct not_returning_iter_reference : valid_forward_iterator { - struct Proxy { operator const 
not_returning_iter_reference&(); @@ -167,12 +167,14 @@ struct not_returning_iter_reference : valid_forward_iterator >' was not satisfied}} + // clang-format on } template @@ -246,7 +248,8 @@ struct missing_minus_const_iter_const_iter : valid_random_access_iterator { @@ -359,62 +362,64 @@ struct missing_const_const_greater_eq : valid_random_access_iterator __iter' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) > __iter' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because '__iter > std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) > std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_less_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_less_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because '__iter <= __iter' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_less_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_less_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) <= __iter' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_less_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_less_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because '__iter <= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_less_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_less_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) <= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_greater_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_greater_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because '__iter >= __iter' would be invalid: overload resolution selected deleted operator '>='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater_eq); // 
expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) >= __iter' would be invalid: overload resolution selected deleted operator '>='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because '__iter >= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) >= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>='}} + // clang-format on } diff --git a/libcxx/test/libcxx/algorithms/pstl.iterator-requirements.verify.cpp b/libcxx/test/libcxx/algorithms/pstl.iterator-requirements.verify.cpp new file mode 100644 index 00000000000000..98e3509752e165 --- /dev/null +++ b/libcxx/test/libcxx/algorithms/pstl.iterator-requirements.verify.cpp @@ -0,0 +1,192 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 +// REQUIRES: stdlib=libc++ +// UNSUPPORTED: libcpp-has-no-incomplete-pstl + +// +// + +// Make sure that all PSTL algorithms contain checks for iterator requirements. +// This is not a requirement from the Standard, but we strive to catch misuse in +// the PSTL both because we can, and because iterator category mistakes in the +// PSTL can lead to subtle bugs. + +// Ignore spurious errors after the initial static_assert failure. +// ADDITIONAL_COMPILE_FLAGS: -Xclang -verify-ignore-unexpected=error + +// We only diagnose this in C++20 and above because we implement the checks with concepts. +// UNSUPPORTED: c++17 + +#include +#include +#include + +#include "test_iterators.h" + +using non_forward_iterator = cpp17_input_iterator; +struct non_output_iterator : forward_iterator { + constexpr int const& operator*() const; // prevent it from being an output iterator +}; + +void f(non_forward_iterator non_fwd, non_output_iterator non_output, std::execution::sequenced_policy pol) { + auto pred = [](auto&&...) -> bool { return true; }; + auto func = [](auto&&...) 
-> int { return 1; }; + int* it = nullptr; + int* out = nullptr; + std::size_t n = 0; + int val = 0; + + { + (void)std::any_of(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: any_of}} + (void)std::all_of(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: all_of}} + (void)std::none_of(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: none_of}} + } + + { + (void)std::copy(pol, non_fwd, non_fwd, it); // expected-error@*:* {{static assertion failed: copy}} + (void)std::copy(pol, it, it, non_fwd); // expected-error@*:* {{static assertion failed: copy}} + (void)std::copy(pol, it, it, non_output); // expected-error@*:* {{static assertion failed: copy}} + } + { + (void)std::copy_n(pol, non_fwd, n, it); // expected-error@*:* {{static assertion failed: copy_n}} + (void)std::copy_n(pol, it, n, non_fwd); // expected-error@*:* {{static assertion failed: copy_n}} + (void)std::copy_n(pol, it, n, non_output); // expected-error@*:* {{static assertion failed: copy_n}} + } + + { + (void)std::count(pol, non_fwd, non_fwd, val); // expected-error@*:* {{static assertion failed: count}} + (void)std::count_if(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: count_if}} + } + + { + (void)std::equal(pol, non_fwd, non_fwd, it); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, it, it, non_fwd); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, non_fwd, non_fwd, it, pred); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, it, it, non_fwd, pred); // expected-error@*:* {{static assertion failed: equal}} + + (void)std::equal(pol, non_fwd, non_fwd, it, it); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, it, it, non_fwd, non_fwd); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, non_fwd, non_fwd, it, it, pred); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, it, it, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: equal}} + } + + { + (void)std::fill(pol, non_fwd, non_fwd, val); // expected-error@*:* {{static assertion failed: fill}} + (void)std::fill_n(pol, non_fwd, n, val); // expected-error@*:* {{static assertion failed: fill_n}} + } + + { + (void)std::find(pol, non_fwd, non_fwd, val); // expected-error@*:* {{static assertion failed: find}} + (void)std::find_if(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: find_if}} + (void)std::find_if_not(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: find_if_not}} + } + + { + (void)std::for_each(pol, non_fwd, non_fwd, func); // expected-error@*:* {{static assertion failed: for_each}} + (void)std::for_each_n(pol, non_fwd, n, func); // expected-error@*:* {{static assertion failed: for_each_n}} + } + + { + (void)std::generate(pol, non_fwd, non_fwd, func); // expected-error@*:* {{static assertion failed: generate}} + (void)std::generate_n(pol, non_fwd, n, func); // expected-error@*:* {{static assertion failed: generate_n}} + } + + { + (void)std::is_partitioned( + pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: is_partitioned}} + } + + { + (void)std::merge(pol, non_fwd, non_fwd, it, it, out); // expected-error@*:* {{static assertion failed: merge}} + (void)std::merge(pol, it, it, non_fwd, non_fwd, out); // expected-error@*:* {{static assertion failed: merge}} + 
(void)std::merge(pol, it, it, it, it, non_output); // expected-error@*:* {{static assertion failed: merge}} + + (void)std::merge(pol, non_fwd, non_fwd, it, it, out, pred); // expected-error@*:* {{static assertion failed: merge}} + (void)std::merge(pol, it, it, non_fwd, non_fwd, out, pred); // expected-error@*:* {{static assertion failed: merge}} + (void)std::merge(pol, it, it, it, it, non_output, pred); // expected-error@*:* {{static assertion failed: merge}} + } + + { + (void)std::move(pol, non_fwd, non_fwd, out); // expected-error@*:* {{static assertion failed: move}} + (void)std::move(pol, it, it, non_fwd); // expected-error@*:* {{static assertion failed: move}} + (void)std::move(pol, it, it, non_output); // expected-error@*:* {{static assertion failed: move}} + } + + { + (void)std::replace_if( + pol, non_fwd, non_fwd, pred, val); // expected-error@*:* {{static assertion failed: replace_if}} + (void)std::replace(pol, non_fwd, non_fwd, val, val); // expected-error@*:* {{static assertion failed: replace}} + + (void)std::replace_copy_if( + pol, non_fwd, non_fwd, out, pred, val); // expected-error@*:* {{static assertion failed: replace_copy_if}} + (void)std::replace_copy_if( + pol, it, it, non_fwd, pred, val); // expected-error@*:* {{static assertion failed: replace_copy_if}} + (void)std::replace_copy_if( + pol, it, it, non_output, pred, val); // expected-error@*:* {{static assertion failed: replace_copy_if}} + + (void)std::replace_copy( + pol, non_fwd, non_fwd, out, val, val); // expected-error@*:* {{static assertion failed: replace_copy}} + (void)std::replace_copy( + pol, it, it, non_fwd, val, val); // expected-error@*:* {{static assertion failed: replace_copy}} + (void)std::replace_copy( + pol, it, it, non_output, val, val); // expected-error@*:* {{static assertion failed: replace_copy}} + } + + { + (void)std::rotate_copy( + pol, non_fwd, non_fwd, non_fwd, out); // expected-error@*:* {{static assertion failed: rotate_copy}} + (void)std::rotate_copy(pol, it, it, it, non_fwd); // expected-error@*:* {{static assertion failed: rotate_copy}} + (void)std::rotate_copy(pol, it, it, it, non_output); // expected-error@*:* {{static assertion failed: rotate_copy}} + } + + { + (void)std::sort(pol, non_fwd, non_fwd); // expected-error@*:* {{static assertion failed: sort}} + (void)std::sort(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: sort}} + } + + { + (void)std::stable_sort(pol, non_fwd, non_fwd); // expected-error@*:* {{static assertion failed: stable_sort}} + (void)std::stable_sort(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: stable_sort}} + } + + { + (void)std::transform(pol, non_fwd, non_fwd, out, func); // expected-error@*:* {{static assertion failed: transform}} + (void)std::transform(pol, it, it, non_fwd, func); // expected-error@*:* {{static assertion failed: transform}} + (void)std::transform(pol, it, it, non_output, func); // expected-error@*:* {{static assertion failed: transform}} + + (void)std::transform( + pol, non_fwd, non_fwd, it, out, func); // expected-error@*:* {{static assertion failed: transform}} + (void)std::transform(pol, it, it, non_fwd, out, func); // expected-error@*:* {{static assertion failed: transform}} + (void)std::transform(pol, it, it, it, non_fwd, func); // expected-error@*:* {{static assertion failed: transform}} + (void)std::transform( + pol, it, it, it, non_output, func); // expected-error@*:* {{static assertion failed: transform}} + } + + { + (void)std::reduce(pol, non_fwd, non_fwd); // 
expected-error@*:* {{static assertion failed: reduce}} + (void)std::reduce(pol, non_fwd, non_fwd, val); // expected-error@*:* {{static assertion failed: reduce}} + (void)std::reduce(pol, non_fwd, non_fwd, val, func); // expected-error@*:* {{static assertion failed: reduce}} + } + + { + (void)std::transform_reduce( + pol, non_fwd, non_fwd, it, val); // expected-error@*:* {{static assertion failed: transform_reduce}} + (void)std::transform_reduce( + pol, it, it, non_fwd, val); // expected-error@*:* {{static assertion failed: transform_reduce}} + + (void)std::transform_reduce( + pol, non_fwd, non_fwd, it, val, func, func); // expected-error@*:* {{static assertion failed: transform_reduce}} + (void)std::transform_reduce( + pol, it, it, non_fwd, val, func, func); // expected-error@*:* {{static assertion failed: transform_reduce}} + + (void)std::transform_reduce( + pol, non_fwd, non_fwd, val, func, func); // expected-error@*:* {{static assertion failed: transform_reduce}} + } +} diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h index 7ffb74990fa4dd..aa819ecd4733bd 100644 --- a/libcxx/test/support/test_iterators.h +++ b/libcxx/test/support/test_iterators.h @@ -1484,9 +1484,14 @@ class iterator_wrapper { return tmp; } - iterator_wrapper& operator+=(difference_type i) { + Derived& operator+=(difference_type i) { iter_ += i; - return *this; + return static_cast(*this); + } + + Derived& operator-=(difference_type i) { + iter_ -= i; + return static_cast(*this); } friend decltype(iter_ - iter_) operator-(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { @@ -1503,8 +1508,15 @@ class iterator_wrapper { return iter; } + friend Derived operator+(difference_type i, Derived iter) { return iter + i; } + friend bool operator==(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ == rhs.iter_; } friend bool operator!=(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ != rhs.iter_; } + + friend bool operator>(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ > rhs.iter_; } + friend bool operator<(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ < rhs.iter_; } + friend bool operator<=(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ <= rhs.iter_; } + friend bool operator>=(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ >= rhs.iter_; } }; class iterator_error : std::runtime_error { From 6c7853080451a7f9ba612f57b41a076061b2a1a9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 17 Apr 2024 13:18:55 +0100 Subject: [PATCH 243/300] [X86] vector-shuffle-combining-sse41.ll - add missing AVX1/2/512 check prefixes --- .../X86/vector-shuffle-combining-sse41.ll | 133 +++++++++++++++++- 1 file changed, 129 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll index 33851f56fe8de5..8d213d25774338 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s 
-mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512 ; Combine tests involving SSE41 target shuffles (BLEND,INSERTPS,MOVZX) @@ -29,6 +29,30 @@ define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1) ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] ; SSE-NEXT: retq +; +; AVX1-LABEL: combine_blend_of_permutes_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_blend_of_permutes_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: combine_blend_of_permutes_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,19,0,17] +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %s0 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> %s1 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> %x0 = bitcast <2 x i64> %s0 to <4 x i32> @@ -71,6 +95,107 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq +; +; AVX1-LABEL: PR50049: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u] +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR50049: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: PR50049: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX512-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512-NEXT: 
vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x1 = load <48 x i8>, ptr %p1, align 16 %x2 = load <48 x i8>, ptr %p2, align 16 %s1 = shufflevector <48 x i8> %x1, <48 x i8> poison, <16 x i32> From 37b26bf48b9894ed0c13fd1aede23472660fb75e Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Wed, 17 Apr 2024 14:24:51 +0200 Subject: [PATCH 244/300] [mlir] transform.apply_patterns support more config options (#88484) Greedy rewrite driver has options to control the number of rewrites applies. Expose those via the corresponding transform op. --- .../mlir/Dialect/Transform/IR/TransformOps.td | 5 +++- .../lib/Dialect/Transform/IR/TransformOps.cpp | 7 +++++ .../Transform/test-pattern-application.mlir | 30 +++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index 21c9595860d4c5..fbac1ffb621fd2 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -331,7 +331,10 @@ def ApplyPatternsOp : TransformDialectOp<"apply_patterns", }]; let arguments = (ins - TransformHandleTypeInterface:$target, UnitAttr:$apply_cse); + TransformHandleTypeInterface:$target, + UnitAttr:$apply_cse, + DefaultValuedAttr(-1)">:$max_iterations, + DefaultValuedAttr(-1)">:$max_num_rewrites); let results = (outs); let regions = (region MaxSizedRegion<1>:$patterns); diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index dc19022219e5b2..53f958caa0bdb7 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -396,6 +396,13 @@ DiagnosedSilenceableFailure transform::ApplyPatternsOp::applyToOne( static_cast(rewriter.getListener()); FrozenRewritePatternSet frozenPatterns(std::move(patterns)); + config.maxIterations = getMaxIterations() == static_cast(-1) + ? GreedyRewriteConfig::kNoLimit + : getMaxIterations(); + config.maxNumRewrites = getMaxNumRewrites() == static_cast(-1) + ? GreedyRewriteConfig::kNoLimit + : getMaxNumRewrites(); + // Apply patterns and CSE repetitively until a fixpoint is reached. If no CSE // was requested, apply the greedy pattern rewrite only once. (The greedy // pattern rewrite driver already iterates to a fixpoint internally.) 
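For context on how the new attributes reach the rewriter, here is a minimal C++ sketch of the downstream entry point they configure (hedged: applyPatternsAndFoldGreedily and the config fields are the standard MLIR greedy-driver API; the limit values are purely illustrative):

    #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

    using namespace mlir;

    // Illustrative only: a capped GreedyRewriteConfig of the kind
    // transform.apply_patterns now builds from its new attributes.
    LogicalResult runCapped(Operation *op,
                            const FrozenRewritePatternSet &patterns) {
      GreedyRewriteConfig config;
      config.maxIterations = 10; // stop fixpoint iteration after 10 sweeps
      config.maxNumRewrites = 1; // apply at most one rewrite in total
      return applyPatternsAndFoldGreedily(op, patterns, config);
    }
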
diff --git a/mlir/test/Dialect/Transform/test-pattern-application.mlir b/mlir/test/Dialect/Transform/test-pattern-application.mlir index fa8a555af92188..f78b4b6f6798c5 100644 --- a/mlir/test/Dialect/Transform/test-pattern-application.mlir +++ b/mlir/test/Dialect/Transform/test-pattern-application.mlir @@ -26,6 +26,36 @@ module attributes {transform.with_named_sequence} { // ----- +// CHECK-LABEL: @limited_updates +func.func @limited_updates() { + "test.container"() ({ + // Only one is replaced. + // CHECK: "test.foo"() {replace_with_new_op = "test.foo"} + // CHECK: "test.foo"() : () + %0 = "test.foo"() {replace_with_new_op = "test.foo"} : () -> (i32) + %1 = "test.foo"() {replace_with_new_op = "test.foo"} : () -> (i32) + }) : () -> () + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // Pattern application will fail because of the upper limit, wrap in + // sequence to suppress the error message. + transform.sequence %arg0 : !transform.any_op failures(suppress) { + ^bb0(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.transform.test_patterns + } {max_num_rewrites = 1} : !transform.any_op + } + transform.yield + } +} + +// ----- + func.func @replacement_op_not_found() { "test.container"() ({ // expected-note @below {{[0] replaced op}} From 4536ad47579d8d61f372ab85128bcfaed58a1256 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 17 Apr 2024 20:24:59 +0800 Subject: [PATCH 245/300] [RISCV] Fix clang-tidy warning about else after return. NFC --- llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 6e45f0c703ceb8..aab91adbb64be4 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -1551,7 +1551,9 @@ void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) { ToDelete.push_back(&MI); // Leave NextMI unchanged continue; - } else if (canMutatePriorConfig(MI, *NextMI, Used, *MRI)) { + } + + if (canMutatePriorConfig(MI, *NextMI, Used, *MRI)) { if (!isVLPreservingConfig(*NextMI)) { MI.getOperand(0).setReg(NextMI->getOperand(0).getReg()); MI.getOperand(0).setIsDead(false); From 86a78284e7ce2ecc7a9283c7d141566a32371492 Mon Sep 17 00:00:00 2001 From: Quentin Dian Date: Wed, 17 Apr 2024 20:27:09 +0800 Subject: [PATCH 246/300] [TailDuplicator] Add maximum predecessors and successors to consider tail duplicating blocks (#78582) Fixes #78578. Duplicating a BB which has both multiple predecessors and successors will result in a complex CFG and also may cause huge amount of PHI nodes. See https://github.com/llvm/llvm-project/issues/78578#issuecomment-1962363580 for a detailed description of the limit. 
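For a rough sense of scale (numbers purely illustrative): duplicating a tail block into each of its P predecessors keeps all S successors per copy, so the successors can collectively gain up to P * S new incoming CFG edges, and each PHI node in a successor can grow by up to P extra incoming values; the default thresholds added below (16 and 16) cap that product at 256. A hedged C++ sketch of the quantity being bounded:

    #include "llvm/CodeGen/MachineBasicBlock.h"

    // Hedged sketch, not part of the patch: the quadratic fan-out that the
    // new pred/succ thresholds bound.
    static unsigned worstCaseNewEdges(const llvm::MachineBasicBlock &TailBB) {
      // One duplicate per predecessor, each duplicate keeping every successor.
      return TailBB.pred_size() * TailBB.succ_size();
    }
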
--- llvm/lib/CodeGen/TailDuplicator.cpp | 20 ++ .../CodeGen/X86/tail-dup-pred-succ-size.mir | 260 ++++++++++++++++++ 2 files changed, 280 insertions(+) create mode 100644 llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index 5ed67bd0a121ed..f5dd21cb927012 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -68,6 +68,18 @@ static cl::opt TailDupIndirectBranchSize( "end with indirect branches."), cl::init(20), cl::Hidden); +static cl::opt + TailDupPredSize("tail-dup-pred-size", + cl::desc("Maximum predecessors (maximum successors at the " + "same time) to consider tail duplicating blocks."), + cl::init(16), cl::Hidden); + +static cl::opt + TailDupSuccSize("tail-dup-succ-size", + cl::desc("Maximum successors (maximum predecessors at the " + "same time) to consider tail duplicating blocks."), + cl::init(16), cl::Hidden); + static cl::opt TailDupVerify("tail-dup-verify", cl::desc("Verify sanity of PHI instructions during taildup"), @@ -565,6 +577,14 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple, if (TailBB.isSuccessor(&TailBB)) return false; + // Duplicating a BB which has both multiple predecessors and successors will + // result in a complex CFG and also may cause huge amount of PHI nodes. If we + // want to remove this limitation, we have to address + // https://github.com/llvm/llvm-project/issues/78578. + if (TailBB.pred_size() > TailDupPredSize && + TailBB.succ_size() > TailDupSuccSize) + return false; + // Set the limit on the cost to duplicate. When optimizing for size, // duplicate only one, because one branch instruction can be eliminated to // compensate for the duplication. diff --git a/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir b/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir new file mode 100644 index 00000000000000..67f8cc72e0d726 --- /dev/null +++ b/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir @@ -0,0 +1,260 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=3 -tail-dup-succ-size=3 %s -o - | FileCheck %s -check-prefix=LIMIT +# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=4 -tail-dup-succ-size=4 %s -o - | FileCheck %s -check-prefix=NOLIMIT + +--- +name: foo +tracksRegLiveness: true +jumpTable: + kind: block-address + entries: + - id: 0 + blocks: [ '%bb.2', '%bb.3', '%bb.4', '%bb.5' ] + - id: 1 + blocks: [ '%bb.9', '%bb.10', '%bb.11', '%bb.12' ] +body: | + ; LIMIT-LABEL: name: foo + ; LIMIT: bb.0: + ; LIMIT-NEXT: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000) + ; LIMIT-NEXT: liveins: $rdi, $esi + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $esi + ; LIMIT-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi + ; LIMIT-NEXT: [[SHR32ri:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 1, implicit-def dead $eflags + ; LIMIT-NEXT: [[AND32ri:%[0-9]+]]:gr32 = AND32ri [[SHR32ri]], 7, implicit-def dead $eflags + ; LIMIT-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, killed [[AND32ri]], %subreg.sub_32bit + ; LIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG]], %jump-table.0, $noreg + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.2: + ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: JMP_1 
%bb.7 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.3: + ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri1:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm1]], 1, implicit-def dead $eflags + ; LIMIT-NEXT: JMP_1 %bb.7 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.4: + ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri2:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm2]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: JMP_1 %bb.7 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.5: + ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm3:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri3:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm3]], 3, implicit-def dead $eflags + ; LIMIT-NEXT: JMP_1 %bb.7 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.6: + ; LIMIT-NEXT: successors: + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.7: + ; LIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[PHI:%[0-9]+]]:gr32 = PHI [[SHR32ri3]], %bb.5, [[SHR32ri2]], %bb.4, [[SHR32ri1]], %bb.3, [[MOV32rm]], %bb.2 + ; LIMIT-NEXT: [[SHR32ri4:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: [[AND32ri1:%[0-9]+]]:gr32 = AND32ri [[SHR32ri4]], 7, implicit-def dead $eflags + ; LIMIT-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, killed [[AND32ri1]], %subreg.sub_32bit + ; LIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG1]], %jump-table.1, $noreg + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.9: + ; LIMIT-NEXT: successors: %bb.13(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: JMP_1 %bb.13 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.10: + ; LIMIT-NEXT: successors: %bb.13(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri5:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm5]], 1, implicit-def dead $eflags + ; LIMIT-NEXT: JMP_1 %bb.13 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.11: + ; LIMIT-NEXT: successors: %bb.13(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri6:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm6]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: JMP_1 %bb.13 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.12: + ; LIMIT-NEXT: successors: %bb.13(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri7:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm7]], 6, implicit-def dead $eflags + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.13: + ; LIMIT-NEXT: [[PHI1:%[0-9]+]]:gr32 = PHI [[SHR32ri7]], %bb.12, [[SHR32ri6]], %bb.11, [[SHR32ri5]], %bb.10, [[MOV32rm4]], %bb.9 + ; LIMIT-NEXT: [[OR32rr:%[0-9]+]]:gr32 = OR32rr [[PHI1]], [[PHI]], implicit-def dead $eflags + ; LIMIT-NEXT: $eax = COPY [[OR32rr]] + ; LIMIT-NEXT: RET 0, $eax + ; + ; NOLIMIT-LABEL: name: foo + ; NOLIMIT: bb.0: + ; NOLIMIT-NEXT: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000) + ; NOLIMIT-NEXT: liveins: $rdi, $esi + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $esi + ; NOLIMIT-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi + ; 
NOLIMIT-NEXT: [[SHR32ri:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 1, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[AND32ri:%[0-9]+]]:gr32 = AND32ri [[SHR32ri]], 7, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, killed [[AND32ri]], %subreg.sub_32bit + ; NOLIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG]], %jump-table.0, $noreg + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.2: + ; NOLIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri1:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[AND32ri1:%[0-9]+]]:gr32 = AND32ri [[SHR32ri1]], 7, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri1]], %subreg.sub_32bit + ; NOLIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG1]], %jump-table.1, $noreg + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.3: + ; NOLIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri2:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm1]], 1, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SHR32ri3:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[AND32ri2:%[0-9]+]]:gr32 = AND32ri [[SHR32ri3]], 7, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri2]], %subreg.sub_32bit + ; NOLIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG2]], %jump-table.1, $noreg + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.4: + ; NOLIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri4:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm2]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SHR32ri5:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[AND32ri3:%[0-9]+]]:gr32 = AND32ri [[SHR32ri5]], 7, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri3]], %subreg.sub_32bit + ; NOLIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG3]], %jump-table.1, $noreg + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.5: + ; NOLIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[MOV32rm3:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri6:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm3]], 3, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SHR32ri7:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[AND32ri4:%[0-9]+]]:gr32 = AND32ri [[SHR32ri7]], 7, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SUBREG_TO_REG4:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri4]], %subreg.sub_32bit + ; NOLIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG4]], %jump-table.1, $noreg + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.6: + ; NOLIMIT-NEXT: successors: + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.9: + ; NOLIMIT-NEXT: successors: %bb.13(0x80000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[PHI:%[0-9]+]]:gr32 = PHI [[MOV32rm]], %bb.2, [[SHR32ri2]], %bb.3, [[SHR32ri4]], %bb.4, [[SHR32ri6]], 
%bb.5 + ; NOLIMIT-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: JMP_1 %bb.13 + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.10: + ; NOLIMIT-NEXT: successors: %bb.13(0x80000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[PHI1:%[0-9]+]]:gr32 = PHI [[MOV32rm]], %bb.2, [[SHR32ri2]], %bb.3, [[SHR32ri4]], %bb.4, [[SHR32ri6]], %bb.5 + ; NOLIMIT-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri8:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm5]], 1, implicit-def dead $eflags + ; NOLIMIT-NEXT: JMP_1 %bb.13 + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.11: + ; NOLIMIT-NEXT: successors: %bb.13(0x80000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[PHI2:%[0-9]+]]:gr32 = PHI [[MOV32rm]], %bb.2, [[SHR32ri2]], %bb.3, [[SHR32ri4]], %bb.4, [[SHR32ri6]], %bb.5 + ; NOLIMIT-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri9:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm6]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: JMP_1 %bb.13 + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.12: + ; NOLIMIT-NEXT: successors: %bb.13(0x80000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[PHI3:%[0-9]+]]:gr32 = PHI [[MOV32rm]], %bb.2, [[SHR32ri2]], %bb.3, [[SHR32ri4]], %bb.4, [[SHR32ri6]], %bb.5 + ; NOLIMIT-NEXT: [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri10:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm7]], 6, implicit-def dead $eflags + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.13: + ; NOLIMIT-NEXT: [[PHI4:%[0-9]+]]:gr32 = PHI [[PHI]], %bb.9, [[PHI1]], %bb.10, [[PHI2]], %bb.11, [[PHI3]], %bb.12 + ; NOLIMIT-NEXT: [[PHI5:%[0-9]+]]:gr32 = PHI [[SHR32ri10]], %bb.12, [[SHR32ri9]], %bb.11, [[SHR32ri8]], %bb.10, [[MOV32rm4]], %bb.9 + ; NOLIMIT-NEXT: [[OR32rr:%[0-9]+]]:gr32 = OR32rr [[PHI5]], [[PHI4]], implicit-def dead $eflags + ; NOLIMIT-NEXT: $eax = COPY [[OR32rr]] + ; NOLIMIT-NEXT: RET 0, $eax + bb.0: + liveins: $rdi, $esi + + %11:gr32 = COPY $esi + %10:gr64 = COPY $rdi + %13:gr32 = SHR32ri %11, 1, implicit-def dead $eflags + %14:gr32 = AND32ri %13, 7, implicit-def dead $eflags + %12:gr64_nosp = SUBREG_TO_REG 0, killed %14, %subreg.sub_32bit + + bb.1: + successors: %bb.2, %bb.3, %bb.4, %bb.5 + + JMP64m $noreg, 8, %12, %jump-table.0, $noreg + + bb.2: + %0:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + JMP_1 %bb.7 + + bb.3: + %17:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %1:gr32 = SHR32ri %17, 1, implicit-def dead $eflags + JMP_1 %bb.7 + + bb.4: + %16:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %2:gr32 = SHR32ri %16, 2, implicit-def dead $eflags + JMP_1 %bb.7 + + bb.5: + %15:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %3:gr32 = SHR32ri %15, 3, implicit-def dead $eflags + JMP_1 %bb.7 + + bb.6: + successors: + + bb.7: + %4:gr32 = PHI %3, %bb.5, %2, %bb.4, %1, %bb.3, %0, %bb.2 + %19:gr32 = SHR32ri %11, 2, implicit-def dead $eflags + %20:gr32 = AND32ri %19, 7, implicit-def dead $eflags + %18:gr64_nosp = SUBREG_TO_REG 0, killed %20, %subreg.sub_32bit + + bb.8: + successors: %bb.9, %bb.10, %bb.11, %bb.12 + + JMP64m $noreg, 8, %18, %jump-table.1, $noreg + + bb.9: + %5:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + JMP_1 %bb.13 + + bb.10: + %23:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %6:gr32 = SHR32ri %23, 1, implicit-def dead $eflags + JMP_1 %bb.13 + + bb.11: + %22:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %7:gr32 = SHR32ri %22, 2, implicit-def dead $eflags + JMP_1 %bb.13 + + bb.12: + %21:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %8:gr32 = SHR32ri %21, 6, implicit-def dead 
$eflags + + bb.13: + %9:gr32 = PHI %8, %bb.12, %7, %bb.11, %6, %bb.10, %5, %bb.9 + %24:gr32 = OR32rr %9, %4, implicit-def dead $eflags + $eax = COPY %24 + RET 0, $eax + +... From 915c84b1480bb3c6d2e44ca83822d2c2304b763a Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Wed, 17 Apr 2024 14:33:18 +0200 Subject: [PATCH 247/300] [lldb] Fix evaluation of expressions with static initializers (#89063) After 281d71604f418eb952e967d9dc4b26241b7f96a, llvm generates 32-bit relocations, which overflow when we load these objects into high memory. Interestingly, setting the code model to "large" does not help here (perhaps it is the default?). I'm not completely sure that this is the right thing to do, but it doesn't seem to cause any ill effects. I'll follow up with the author of that patch about the expected behavior here. --- lldb/source/Expression/IRExecutionUnit.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lldb/source/Expression/IRExecutionUnit.cpp b/lldb/source/Expression/IRExecutionUnit.cpp index cb9bee8733e15d..7ad0e5ff22b2f6 100644 --- a/lldb/source/Expression/IRExecutionUnit.cpp +++ b/lldb/source/Expression/IRExecutionUnit.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" @@ -279,10 +280,13 @@ void IRExecutionUnit::GetRunnableInfo(Status &error, lldb::addr_t &func_addr, llvm::EngineBuilder builder(std::move(m_module_up)); llvm::Triple triple(m_module->getTargetTriple()); + // PIC needed for ELF to avoid generating 32-bit relocations (which overflow + // if the object is loaded into high memory). + bool want_pic = triple.isOSBinFormatMachO() || triple.isOSBinFormatELF(); + builder.setEngineKind(llvm::EngineKind::JIT) .setErrorStr(&error_string) - .setRelocationModel(triple.isOSBinFormatMachO() ? llvm::Reloc::PIC_ - : llvm::Reloc::Static) + .setRelocationModel(want_pic ? llvm::Reloc::PIC_ : llvm::Reloc::Static) .setMCJITMemoryManager(std::make_unique(*this)) .setOptLevel(llvm::CodeGenOptLevel::Less); From 79726ef5d20f3bf6510b67d7dd3378b8a41db768 Mon Sep 17 00:00:00 2001 From: "Kevin P. Neal" Date: Wed, 17 Apr 2024 08:34:25 -0400 Subject: [PATCH 248/300] [VP] Correct lowering of predicated fma and faddmul to avoid strictfp. (#85272) Correct missing cases in a switch that result in @llvm.vp.fma.v4f32 getting lowered to a constrained fma intrinsic. Vector predicated lowering to contrained intrinsics is not supported currently, and there's no consensus on the path forward. We certainly shouldn't be introducing constrained intrinsics into a function that isn't strictfp. Problem found with D146845. --- llvm/include/llvm/IR/Intrinsics.h | 4 + llvm/lib/CodeGen/ExpandVectorPredication.cpp | 12 +- llvm/lib/IR/Function.cpp | 22 ++- .../Generic/expand-vp-fp-intrinsics.ll | 176 ++++++++++++++++++ 4 files changed, 203 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/Generic/expand-vp-fp-intrinsics.ll diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index 0dfe9f029f9b1a..92eae344ce729e 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -105,6 +105,10 @@ namespace Intrinsic { /// Map a MS builtin name to an intrinsic ID. ID getIntrinsicForMSBuiltin(const char *Prefix, StringRef BuiltinName); + /// Returns true if the intrinsic ID is for one of the "Constrained + /// Floating-Point Intrinsics". 
+ bool isConstrainedFPIntrinsic(ID QID); + /// This is a type descriptor which explains the type requirements of an /// intrinsic. This is returned by getIntrinsicInfoTableEntries. struct IITDescriptor { diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 0fe4cfefdb1600..8e623c85b737b0 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -340,6 +340,8 @@ Value *CachingVPExpander::expandPredicationToFPCall( replaceOperation(*NewOp, VPI); return NewOp; } + case Intrinsic::fma: + case Intrinsic::fmuladd: case Intrinsic::experimental_constrained_fma: case Intrinsic::experimental_constrained_fmuladd: { Value *Op0 = VPI.getOperand(0); @@ -347,8 +349,12 @@ Value *CachingVPExpander::expandPredicationToFPCall( Value *Op2 = VPI.getOperand(2); Function *Fn = Intrinsic::getDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); - Value *NewOp = - Builder.CreateConstrainedFPCall(Fn, {Op0, Op1, Op2}, VPI.getName()); + Value *NewOp; + if (Intrinsic::isConstrainedFPIntrinsic(UnpredicatedIntrinsicID)) + NewOp = + Builder.CreateConstrainedFPCall(Fn, {Op0, Op1, Op2}, VPI.getName()); + else + NewOp = Builder.CreateCall(Fn, {Op0, Op1, Op2}, VPI.getName()); replaceOperation(*NewOp, VPI); return NewOp; } @@ -731,6 +737,8 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { case Intrinsic::vp_minnum: case Intrinsic::vp_maximum: case Intrinsic::vp_minimum: + case Intrinsic::vp_fma: + case Intrinsic::vp_fmuladd: return expandPredicationToFPCall(Builder, VPI, VPI.getFunctionalIntrinsicID().value()); case Intrinsic::vp_load: diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 96953ac49c19b4..818a167560de69 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -499,15 +499,7 @@ static MutableArrayRef makeArgArray(Argument *Args, size_t Count) { } bool Function::isConstrainedFPIntrinsic() const { - switch (getIntrinsicID()) { -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ - case Intrinsic::INTRINSIC: -#include "llvm/IR/ConstrainedOps.def" - return true; -#undef INSTRUCTION - default: - return false; - } + return Intrinsic::isConstrainedFPIntrinsic(getIntrinsicID()); } void Function::clearArguments() { @@ -1486,6 +1478,18 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef Tys) { #include "llvm/IR/IntrinsicImpl.inc" #undef GET_LLVM_INTRINSIC_FOR_MS_BUILTIN +bool Intrinsic::isConstrainedFPIntrinsic(ID QID) { + switch (QID) { +#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ + case Intrinsic::INTRINSIC: +#include "llvm/IR/ConstrainedOps.def" + return true; +#undef INSTRUCTION + default: + return false; + } +} + using DeferredIntrinsicMatchPair = std::pair>; diff --git a/llvm/test/CodeGen/Generic/expand-vp-fp-intrinsics.ll b/llvm/test/CodeGen/Generic/expand-vp-fp-intrinsics.ll new file mode 100644 index 00000000000000..bc89ddea6b85aa --- /dev/null +++ b/llvm/test/CodeGen/Generic/expand-vp-fp-intrinsics.ll @@ -0,0 +1,176 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -expandvp -S < %s | FileCheck %s + +define void @vp_fadd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fadd_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[RES1:%.*]] = fadd <4 x float> [[A0]], [[A1]] +; CHECK-NEXT: store <4 x float> 
[[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> , i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_fsub_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fsub_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = fsub <4 x float> [[A0]], [[A1]] +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> , i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fsub.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_fmul_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fmul_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = fmul <4 x float> [[A0]], [[A1]] +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> , i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fmul.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_fdiv_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fdiv_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = fdiv <4 x float> [[A0]], [[A1]] +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> , i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fdiv.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_frem_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_frem_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = frem <4 x float> [[A0]], [[A1]] +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> , i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.frem.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_fabs_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fabs_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A0]]) +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fabs.v4f32(<4 x float> %a0, <4 x i1> , i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fabs.v4f32(<4 x float>, <4 x i1>, i32) + +define void @vp_sqrt_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: 
define void @vp_sqrt_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[A0]]) +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.sqrt.v4f32(<4 x float> %a0, <4 x i1> , i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.sqrt.v4f32(<4 x float>, <4 x i1>, i32) + +define void @vp_fneg_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fneg_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = fneg <4 x float> [[A0]] +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a0, <4 x i1> , i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fneg.v4f32(<4 x float>, <4 x i1>, i32) + +define void @vp_fma_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind { +; CHECK-LABEL: define void @vp_fma_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i4 [[A5:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A1]]) +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> , i32 4) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_fmuladd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind { +; CHECK-LABEL: define void @vp_fmuladd_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i4 [[A5:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A1]]) +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> , i32 4) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) + +declare <4 x float> @llvm.vp.maxnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) +define <4 x float> @vfmax_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: define <4 x float> @vfmax_vv_v4f32( +; CHECK-SAME: <4 x float> [[VA:%.*]], <4 x float> [[VB:%.*]], <4 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VA]], <4 x float> [[VB]]) +; CHECK-NEXT: ret <4 x float> [[V1]] +; + %v = call <4 x float> @llvm.vp.maxnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl) + ret <4 x float> %v +} + +declare <8 x float> @llvm.vp.maxnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) +define <8 x float> @vfmax_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: define <8 x float> @vfmax_vv_v8f32( +; CHECK-SAME: <8 x float> [[VA:%.*]], <8 x float> [[VB:%.*]], <8 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { +; CHECK-NEXT: 
[[V1:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[VA]], <8 x float> [[VB]]) +; CHECK-NEXT: ret <8 x float> [[V1]] +; + %v = call <8 x float> @llvm.vp.maxnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl) + ret <8 x float> %v +} + +declare <4 x float> @llvm.vp.minnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) +define <4 x float> @vfmin_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: define <4 x float> @vfmin_vv_v4f32( +; CHECK-SAME: <4 x float> [[VA:%.*]], <4 x float> [[VB:%.*]], <4 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VA]], <4 x float> [[VB]]) +; CHECK-NEXT: ret <4 x float> [[V1]] +; + %v = call <4 x float> @llvm.vp.minnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl) + ret <4 x float> %v +} + +declare <8 x float> @llvm.vp.minnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) +define <8 x float> @vfmin_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: define <8 x float> @vfmin_vv_v8f32( +; CHECK-SAME: <8 x float> [[VA:%.*]], <8 x float> [[VB:%.*]], <8 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[VA]], <8 x float> [[VB]]) +; CHECK-NEXT: ret <8 x float> [[V1]] +; + %v = call <8 x float> @llvm.vp.minnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl) + ret <8 x float> %v +} From 7b8625ec16efcb6d11e14a80e08c65dbfe68db9f Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 17 Apr 2024 14:54:14 +0200 Subject: [PATCH 249/300] [AMDGPU][Docs] Fix broken link to HRF memory model reference (#88696) The link to the Heterogeneous-race-free Memory Models ASPLOS'14 paper by Hower et al. pointed to a bogus website, probably because the domain ownership has changed. This patch updates it to a version hosted on research.cs.wisc.edu. --- llvm/docs/AMDGPUUsage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 22c1d1f186ea54..7da5d8e41f6f85 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -16024,7 +16024,7 @@ Additional Documentation .. [CLANG-ATTR] `Attributes in Clang `__ .. [DWARF] `DWARF Debugging Information Format `__ .. [ELF] `Executable and Linkable Format (ELF) `__ -.. [HRF] `Heterogeneous-race-free Memory Models `__ +.. [HRF] `Heterogeneous-race-free Memory Models `__ .. [HSA] `Heterogeneous System Architecture (HSA) Foundation `__ .. [MsgPack] `Message Pack `__ .. [OpenCL] `The OpenCL Specification Version 2.0 `__ From 971ec1f0eea324d4a1eec6709e2c97e1798a6002 Mon Sep 17 00:00:00 2001 From: DianQK Date: Wed, 17 Apr 2024 20:54:17 +0800 Subject: [PATCH 250/300] [Inline] Regenerate inline-switch-default-2.ll (NFC) --- .../Inline/inline-switch-default-2.ll | 176 ------------------ 1 file changed, 176 deletions(-) diff --git a/llvm/test/Transforms/Inline/inline-switch-default-2.ll b/llvm/test/Transforms/Inline/inline-switch-default-2.ll index 8d3e24c798df82..82dae1c27648fc 100644 --- a/llvm/test/Transforms/Inline/inline-switch-default-2.ll +++ b/llvm/test/Transforms/Inline/inline-switch-default-2.ll @@ -4,50 +4,6 @@ ; Check for scenarios without TTI. 
define i64 @foo1(i64 %a) { -; LOOKUPTABLE-LABEL: define i64 @foo1( -; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { -; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH_I:%.*]] [ -; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]] -; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]] -; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]] -; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]] -; LOOKUPTABLE-NEXT: ] -; LOOKUPTABLE: branch_0.i: -; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT:%.*]] -; LOOKUPTABLE: branch_2.i: -; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] -; LOOKUPTABLE: branch_4.i: -; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] -; LOOKUPTABLE: branch_6.i: -; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] -; LOOKUPTABLE: default_branch.i: -; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] -; LOOKUPTABLE: bar1.exit: -; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ], [ 3, [[DEFAULT_BRANCH_I]] ] -; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] -; -; SWITCH-LABEL: define i64 @foo1( -; SWITCH-SAME: i64 [[TMP0:%.*]]) { -; SWITCH-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH_I:%.*]] [ -; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]] -; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]] -; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]] -; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]] -; SWITCH-NEXT: ] -; SWITCH: branch_0.i: -; SWITCH-NEXT: br label [[BAR1_EXIT:%.*]] -; SWITCH: branch_2.i: -; SWITCH-NEXT: br label [[BAR1_EXIT]] -; SWITCH: branch_4.i: -; SWITCH-NEXT: br label [[BAR1_EXIT]] -; SWITCH: branch_6.i: -; SWITCH-NEXT: br label [[BAR1_EXIT]] -; SWITCH: default_branch.i: -; SWITCH-NEXT: br label [[BAR1_EXIT]] -; SWITCH: bar1.exit: -; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ], [ 3, [[DEFAULT_BRANCH_I]] ] -; SWITCH-NEXT: ret i64 [[TMP2]] -; ; CHECK-LABEL: define i64 @foo1( ; CHECK-SAME: i64 [[A:%.*]]) { ; CHECK-NEXT: [[B:%.*]] = call i64 @bar1(i64 [[A]]) @@ -58,50 +14,6 @@ define i64 @foo1(i64 %a) { } define i64 @foo2(i64 %a) { -; LOOKUPTABLE-LABEL: define i64 @foo2( -; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { -; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT_I:%.*]] [ -; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]] -; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]] -; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]] -; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]] -; LOOKUPTABLE-NEXT: ] -; LOOKUPTABLE: branch_0.i: -; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT:%.*]] -; LOOKUPTABLE: branch_2.i: -; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] -; LOOKUPTABLE: branch_4.i: -; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] -; LOOKUPTABLE: branch_6.i: -; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] -; LOOKUPTABLE: unreachabledefault.i: -; LOOKUPTABLE-NEXT: unreachable -; LOOKUPTABLE: bar2.exit: -; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] -; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] -; -; SWITCH-LABEL: define i64 @foo2( -; SWITCH-SAME: i64 [[TMP0:%.*]]) { -; SWITCH-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT_I:%.*]] [ -; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]] -; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]] -; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]] -; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]] -; SWITCH-NEXT: ] -; SWITCH: branch_0.i: -; SWITCH-NEXT: br label [[BAR2_EXIT:%.*]] -; SWITCH: branch_2.i: -; SWITCH-NEXT: br label [[BAR2_EXIT]] -; SWITCH: 
branch_4.i: -; SWITCH-NEXT: br label [[BAR2_EXIT]] -; SWITCH: branch_6.i: -; SWITCH-NEXT: br label [[BAR2_EXIT]] -; SWITCH: unreachabledefault.i: -; SWITCH-NEXT: unreachable -; SWITCH: bar2.exit: -; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] -; SWITCH-NEXT: ret i64 [[TMP2]] -; ; CHECK-LABEL: define i64 @foo2( ; CHECK-SAME: i64 [[A:%.*]]) { ; CHECK-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [ @@ -129,50 +41,6 @@ define i64 @foo2(i64 %a) { } define i64 @bar1(i64 %a) { -; LOOKUPTABLE-LABEL: define i64 @bar1( -; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { -; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH:%.*]] [ -; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] -; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] -; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] -; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] -; LOOKUPTABLE-NEXT: ] -; LOOKUPTABLE: branch_0: -; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] -; LOOKUPTABLE: branch_2: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: branch_4: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: branch_6: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: default_branch: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: exit: -; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] -; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] -; -; SWITCH-LABEL: define i64 @bar1( -; SWITCH-SAME: i64 [[TMP0:%.*]]) { -; SWITCH-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH:%.*]] [ -; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] -; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] -; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] -; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] -; SWITCH-NEXT: ] -; SWITCH: branch_0: -; SWITCH-NEXT: br label [[EXIT:%.*]] -; SWITCH: branch_2: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: branch_4: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: branch_6: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: default_branch: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: exit: -; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] -; SWITCH-NEXT: ret i64 [[TMP2]] -; ; CHECK-LABEL: define i64 @bar1( ; CHECK-SAME: i64 [[A:%.*]]) { ; CHECK-NEXT: switch i64 [[A]], label [[DEFAULT_BRANCH:%.*]] [ @@ -223,50 +91,6 @@ exit: } define i64 @bar2(i64 %a) { -; LOOKUPTABLE-LABEL: define i64 @bar2( -; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { -; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT:%.*]] [ -; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] -; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] -; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] -; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] -; LOOKUPTABLE-NEXT: ] -; LOOKUPTABLE: branch_0: -; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] -; LOOKUPTABLE: branch_2: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: branch_4: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: branch_6: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: unreachabledefault: -; LOOKUPTABLE-NEXT: unreachable -; LOOKUPTABLE: exit: -; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] -; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] -; -; SWITCH-LABEL: define i64 @bar2( -; SWITCH-SAME: i64 [[TMP0:%.*]]) { -; SWITCH-NEXT: switch i64 [[TMP0]], label 
[[UNREACHABLEDEFAULT:%.*]] [ -; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] -; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] -; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] -; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] -; SWITCH-NEXT: ] -; SWITCH: branch_0: -; SWITCH-NEXT: br label [[EXIT:%.*]] -; SWITCH: branch_2: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: branch_4: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: branch_6: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: unreachabledefault: -; SWITCH-NEXT: unreachable -; SWITCH: exit: -; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] -; SWITCH-NEXT: ret i64 [[TMP2]] -; ; CHECK-LABEL: define i64 @bar2( ; CHECK-SAME: i64 [[A:%.*]]) { ; CHECK-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT:%.*]] [ From 73140daebbf522dbb14dc4b2f3c67dc0aa1a62dd Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Wed, 17 Apr 2024 15:01:59 +0200 Subject: [PATCH 251/300] [mlir] expose transform dialect symbol merge to python (#87690) This functionality is available in C++, make it available in Python directly to operate on transform modules. --- .../mlir-c/Dialect/Transform/Interpreter.h | 12 ++- .../Bindings/Python/TransformInterpreter.cpp | 15 ++++ .../lib/CAPI/Dialect/TransformInterpreter.cpp | 9 +++ .../transform/interpreter/__init__.py | 10 ++- .../python/dialects/transform_interpreter.py | 76 +++++++++++++++++++ 5 files changed, 120 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir-c/Dialect/Transform/Interpreter.h b/mlir/include/mlir-c/Dialect/Transform/Interpreter.h index 00095d5040a0e5..fa320324234e8d 100644 --- a/mlir/include/mlir-c/Dialect/Transform/Interpreter.h +++ b/mlir/include/mlir-c/Dialect/Transform/Interpreter.h @@ -60,7 +60,7 @@ MLIR_CAPI_EXPORTED void mlirTransformOptionsDestroy(MlirTransformOptions transformOptions); //----------------------------------------------------------------------------// -// Transform interpreter. +// Transform interpreter and utilities. //----------------------------------------------------------------------------// /// Applies the transformation script starting at the given transform root @@ -72,6 +72,16 @@ MLIR_CAPI_EXPORTED MlirLogicalResult mlirTransformApplyNamedSequence( MlirOperation payload, MlirOperation transformRoot, MlirOperation transformModule, MlirTransformOptions transformOptions); +/// Merge the symbols from `other` into `target`, potentially renaming them to +/// avoid conflicts. Private symbols may be renamed during the merge, public +/// symbols must have at most one declaration. A name conflict in public symbols +/// is reported as an error before returning a failure. +/// +/// Note that this clones the `other` operation unlike the C++ counterpart that +/// takes ownership. 
+MLIR_CAPI_EXPORTED MlirLogicalResult +mlirMergeSymbolsIntoFromClone(MlirOperation target, MlirOperation other); + #ifdef __cplusplus } #endif diff --git a/mlir/lib/Bindings/Python/TransformInterpreter.cpp b/mlir/lib/Bindings/Python/TransformInterpreter.cpp index 6517f8c39dfadd..f6b4532b1b6be4 100644 --- a/mlir/lib/Bindings/Python/TransformInterpreter.cpp +++ b/mlir/lib/Bindings/Python/TransformInterpreter.cpp @@ -82,6 +82,21 @@ static void populateTransformInterpreterSubmodule(py::module &m) { py::arg("payload_root"), py::arg("transform_root"), py::arg("transform_module"), py::arg("transform_options") = PyMlirTransformOptions()); + + m.def( + "copy_symbols_and_merge_into", + [](MlirOperation target, MlirOperation other) { + mlir::python::CollectDiagnosticsToStringScope scope( + mlirOperationGetContext(target)); + + MlirLogicalResult result = mlirMergeSymbolsIntoFromClone(target, other); + if (mlirLogicalResultIsFailure(result)) { + throw py::value_error( + "Failed to merge symbols.\nDiagnostic message " + + scope.takeMessage()); + } + }, + py::arg("target"), py::arg("other")); } PYBIND11_MODULE(_mlirTransformInterpreter, m) { diff --git a/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp b/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp index eb6951dc5584d6..145455e1c1b3d2 100644 --- a/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp +++ b/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp @@ -15,6 +15,7 @@ #include "mlir/CAPI/IR.h" #include "mlir/CAPI/Support.h" #include "mlir/CAPI/Wrap.h" +#include "mlir/Dialect/Transform/IR/Utils.h" #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h" @@ -71,4 +72,12 @@ MlirLogicalResult mlirTransformApplyNamedSequence( unwrap(payload), unwrap(transformRoot), cast(unwrap(transformModule)), *unwrap(transformOptions))); } + +MlirLogicalResult mlirMergeSymbolsIntoFromClone(MlirOperation target, + MlirOperation other) { + OwningOpRef otherOwning(unwrap(other)->clone()); + LogicalResult result = transform::detail::mergeSymbolsInto( + unwrap(target), std::move(otherOwning)); + return wrap(result); +} } diff --git a/mlir/python/mlir/dialects/transform/interpreter/__init__.py b/mlir/python/mlir/dialects/transform/interpreter/__init__.py index 6145b99224eb54..34cdc43cb617fd 100644 --- a/mlir/python/mlir/dialects/transform/interpreter/__init__.py +++ b/mlir/python/mlir/dialects/transform/interpreter/__init__.py @@ -5,7 +5,6 @@ from ....ir import Operation from ...._mlir_libs import _mlirTransformInterpreter as _cextTransformInterpreter - TransformOptions = _cextTransformInterpreter.TransformOptions @@ -31,3 +30,12 @@ def apply_named_sequence( _cextTransformInterpreter.apply_named_sequence(*args) else: _cextTransformInterpreter(*args, transform_options) + + +def copy_symbols_and_merge_into(target, other): + """Copies symbols from other into target, renaming private symbols to avoid + duplicates. 
Raises an error if copying would lead to duplicate public + symbols.""" + _cextTransformInterpreter.copy_symbols_and_merge_into( + _unpack_operation(target), _unpack_operation(other) + ) diff --git a/mlir/test/python/dialects/transform_interpreter.py b/mlir/test/python/dialects/transform_interpreter.py index 740c49f76a26c4..807a98c4932797 100644 --- a/mlir/test/python/dialects/transform_interpreter.py +++ b/mlir/test/python/dialects/transform_interpreter.py @@ -54,3 +54,79 @@ def failed(): assert ( "must implement TransformOpInterface to be used as transform root" in str(e) ) + + +print_root_via_include_module = """ +module @print_root_via_include_module attributes {transform.with_named_sequence} { + transform.named_sequence private @callee1(%root: !transform.any_op {transform.readonly}) + transform.named_sequence private @callee2(%root: !transform.any_op {transform.readonly}) + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.include @callee2 failures(propagate) + (%root) : (!transform.any_op) -> () + transform.yield + } +}""" + +callee2_definition = """ +module attributes {transform.with_named_sequence} { + transform.named_sequence private @callee1(%root: !transform.any_op {transform.readonly}) + transform.named_sequence @callee2(%root: !transform.any_op {transform.readonly}) { + transform.include @callee1 failures(propagate) + (%root) : (!transform.any_op) -> () + transform.yield + } +} +""" + +callee1_definition = """ +module attributes {transform.with_named_sequence} { + transform.named_sequence @callee1(%root: !transform.any_op {transform.readonly}) { + transform.print %root { name = \"from interpreter\" }: !transform.any_op + transform.yield + } +} +""" + + +@test_in_context +def include(): + main = ir.Module.parse(print_root_via_include_module) + callee1 = ir.Module.parse(callee1_definition) + callee2 = ir.Module.parse(callee2_definition) + interp.copy_symbols_and_merge_into(main, callee1) + interp.copy_symbols_and_merge_into(main, callee2) + + # CHECK: @print_root_via_include_module + # CHECK: transform.named_sequence @__transform_main + # CHECK: transform.include @callee2 + # + # CHECK: transform.named_sequence @callee1 + # CHECK: transform.print + # + # CHECK: transform.named_sequence @callee2 + # CHECK: transform.include @callee1 + interp.apply_named_sequence(main, main.body.operations[0], main) + + +@test_in_context +def partial_include(): + main = ir.Module.parse(print_root_via_include_module) + callee2 = ir.Module.parse(callee2_definition) + interp.copy_symbols_and_merge_into(main, callee2) + + try: + interp.apply_named_sequence(main, main.body.operations[0], main) + except ValueError as e: + assert "Failed to apply" in str(e) + + +@test_in_context +def repeated_include(): + main = ir.Module.parse(print_root_via_include_module) + callee2 = ir.Module.parse(callee2_definition) + interp.copy_symbols_and_merge_into(main, callee2) + + try: + interp.copy_symbols_and_merge_into(main, callee2) + except ValueError as e: + assert "doubly defined symbol @callee2" in str(e) From 20d653fdb2d4d6eafa4575cd954beaf7ecad113a Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 17 Apr 2024 08:04:19 -0500 Subject: [PATCH 252/300] [LLVM][CodeGen] Fix register lane liveness tracking in RegisterPressure (#88892) Re-enable an old assertion in `decreaseSetPressure`. 
--- llvm/lib/CodeGen/RegisterPressure.cpp | 39 ++++++++++++++------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp index f86aa3a167202f..3fa22447f416b7 100644 --- a/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/llvm/lib/CodeGen/RegisterPressure.cpp @@ -64,7 +64,7 @@ static void increaseSetPressure(std::vector &CurrSetPressure, static void decreaseSetPressure(std::vector &CurrSetPressure, const MachineRegisterInfo &MRI, Register Reg, LaneBitmask PrevMask, LaneBitmask NewMask) { - //assert((NewMask & !PrevMask) == 0 && "Must not add bits"); + assert((NewMask & ~PrevMask).none() && "Must not add bits"); if (NewMask.any() || PrevMask.none()) return; @@ -617,17 +617,11 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS, ++I; } } - for (auto *I = Uses.begin(); I != Uses.end();) { - LaneBitmask LiveBefore = getLiveLanesAt(LIS, MRI, true, I->RegUnit, - Pos.getBaseIndex()); - LaneBitmask LaneMask = I->LaneMask & LiveBefore; - if (LaneMask.none()) { - I = Uses.erase(I); - } else { - I->LaneMask = LaneMask; - ++I; - } - } + + // For uses just copy the information from LIS. + for (auto &[RegUnit, LaneMask] : Uses) + LaneMask = getLiveLanesAt(LIS, MRI, true, RegUnit, Pos.getBaseIndex()); + if (AddFlagsMI != nullptr) { for (const RegisterMaskPair &P : DeadDefs) { Register RegUnit = P.RegUnit; @@ -1060,18 +1054,27 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { // Kill liveness at live defs. for (const RegisterMaskPair &P : RegOpers.Defs) { Register Reg = P.RegUnit; - LaneBitmask LiveLanes = LiveRegs.contains(Reg); + LaneBitmask LiveAfter = LiveRegs.contains(Reg); LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg); LaneBitmask DefLanes = P.LaneMask; - LaneBitmask LiveAfter = (LiveLanes & ~DefLanes) | UseLanes; - decreaseRegPressure(Reg, LiveLanes, LiveAfter); + LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes; + + // There may be parts of the register that were dead before the + // instruction, but became live afterwards. Similarly, some parts + // may have been killed in this instruction. + decreaseRegPressure(Reg, LiveAfter, LiveAfter & LiveBefore); + increaseRegPressure(Reg, LiveAfter, ~LiveAfter & LiveBefore); } // Generate liveness for uses. for (const RegisterMaskPair &P : RegOpers.Uses) { Register Reg = P.RegUnit; - LaneBitmask LiveLanes = LiveRegs.contains(Reg); - LaneBitmask LiveAfter = LiveLanes | P.LaneMask; - increaseRegPressure(Reg, LiveLanes, LiveAfter); + // If this register was also in a def operand, we've handled it + // with defs. + if (getRegLanes(RegOpers.Defs, Reg).any()) + continue; + LaneBitmask LiveAfter = LiveRegs.contains(Reg); + LaneBitmask LiveBefore = LiveAfter | P.LaneMask; + increaseRegPressure(Reg, LiveAfter, LiveBefore); } } From 1fc72dbc807fb138cafd05501e2e31beaa574693 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 17 Apr 2024 21:10:56 +0800 Subject: [PATCH 253/300] [RISCV] Add test for doLocalPostpass issue not checking if VL was modified. 
NFC
---
 llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
index 1850abe6363bc9..d9a87c2cb12a86 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
@@ -76,6 +76,10 @@
     ret void
   }

+  define void @postpass_modify_vl() {
+    ret void
+  }
+
   declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1

   declare <vscale x 1 x i64> @llvm.riscv.vle.nxv1i64.i64(<vscale x 1 x i64>, ptr nocapture, i64) #4
@@ -503,3 +507,24 @@ body: |
     %5:vr = PseudoVMV_V_I_MF2 $noreg, 1, 2, 5, 0
     PseudoRET
...
+---
+name: postpass_modify_vl
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x1
+    ; CHECK-LABEL: name: postpass_modify_vl
+    ; CHECK: liveins: $x1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $vtype
+    ; CHECK-NEXT: $vl = COPY $x1
+    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
+    %1:gpr = COPY $vtype
+    $vl = COPY $x1
+    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
+    %4:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
+    PseudoRET
+...

From 76ad2897480a85532eee93daf041246881772693 Mon Sep 17 00:00:00 2001
From: Zaara Syeda
Date: Wed, 17 Apr 2024 09:24:53 -0400
Subject: [PATCH 254/300] [PowerPC] 32-bit large code-model support for
 toc-data (#85129)

This patch adds the pseudo op ADDItocL for 32-bit large code-model
support for toc-data.
---
 llvm/lib/Target/PowerPC/P10InstrResources.td |  2 +-
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp    | 61 ++++++++++++------
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp  | 35 +++++++---
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp     |  1 +
 llvm/lib/Target/PowerPC/PPCInstrInfo.td      |  4 +-
 llvm/lib/Target/PowerPC/PPCMacroFusion.def   | 10 +--
 llvm/test/CodeGen/PowerPC/toc-data.ll        | 67 ++++++++++++++++++++
 7 files changed, 144 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td
index 5015ba887d0b65..32cebb65cb5694 100644
--- a/llvm/lib/Target/PowerPC/P10InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P10InstrResources.td
@@ -881,7 +881,7 @@ def : InstRW<[P10W_FX_3C, P10W_DISP_ANY],
 // 3 Cycles ALU operations, 1 input operands
 def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read],
   (instrs
-    ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL8, LI, LI8,
+    ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL, ADDItocL8, LI, LI8,
     ADDIC, ADDIC8,
     ADDIS, ADDIS8, ADDISdtprelHA32, ADDIStocHA, ADDIStocHA8, LIS, LIS8,
     ADDME, ADDME8,

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 1c57b92057fff5..6e1002c45d81cb 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1148,15 +1148,27 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {

     MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);

-    // Always use TOC on AIX. Map the global address operand to be a reference
-    // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to
-    // reference the storage allocated in the TOC which contains the address of
-    // 'MOSymbol'.
-    MCSymbol *TOCEntry =
-        lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
-    const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry,
-                                                MCSymbolRefExpr::VK_PPC_U,
-                                                OutContext);
+    // If the symbol isn't toc-data then use the TOC on AIX.
+    // Map the global address operand to be a reference to the TOC entry we
+    // will synthesize later. 'TOCEntry' is a label used to reference the
+    // storage allocated in the TOC which contains the address of 'MOSymbol'.
+    // If the toc-data attribute is used, the TOC entry contains the data
+    // rather than the address of the MOSymbol.
+    if (![](const MachineOperand &MO) {
+          if (!MO.isGlobal())
+            return false;
+
+          const GlobalVariable *GV = dyn_cast<GlobalVariable>(MO.getGlobal());
+          if (!GV)
+            return false;
+
+          return GV->hasAttribute("toc-data");
+        }(MO)) {
+      MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
+    }
+
+    const MCExpr *Exp = MCSymbolRefExpr::create(
+        MOSymbol, MCSymbolRefExpr::VK_PPC_U, OutContext);
     TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
     return;
@@ -1273,25 +1285,32 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     EmitToStreamer(*OutStreamer, TmpInst);
     return;
   }
+  case PPC::ADDItocL:
   case PPC::ADDItocL8: {
-    // Transform %xd = ADDItocL8 %xs, @sym
+    // Transform %xd = ADDItocL %xs, @sym
     LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);

-    // Change the opcode to ADDI8. If the global address is external, then
-    // generate a TOC entry and reference that. Otherwise, reference the
-    // symbol directly.
-    TmpInst.setOpcode(PPC::ADDI8);
+    unsigned Op = MI->getOpcode();
+
+    // Change the opcode to ADDI8 (64-bit) or LA (32-bit), which loads the
+    // address for toc-data.
+    TmpInst.setOpcode(Op == PPC::ADDItocL8 ? PPC::ADDI8 : PPC::LA);

     const MachineOperand &MO = MI->getOperand(2);
-    assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL8.");
+    assert(((Op == PPC::ADDItocL8) ? (MO.isGlobal() || MO.isCPI())
+                                   : MO.isGlobal()) &&
+           "Invalid operand for ADDItocL8.");
+    assert(!(MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal())) &&
+           "Interposable definitions must use indirect accesses.");

-    LLVM_DEBUG(assert(
-        !(MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal())) &&
-        "Interposable definitions must use indirect access."));
+    // Map the operand to its corresponding MCSymbol.
+    const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
+
+    const MCExpr *Exp = MCSymbolRefExpr::create(
+        MOSymbol,
+        Op == PPC::ADDItocL8 ? MCSymbolRefExpr::VK_PPC_TOC_LO
+                             : MCSymbolRefExpr::VK_PPC_L,
+        OutContext);

-    const MCExpr *Exp =
-        MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO, *this),
-                                MCSymbolRefExpr::VK_PPC_TOC_LO, OutContext);
     TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
     return;

diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index af82b6cdb1809e..2f647daa4bcb57 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -510,7 +510,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
 }

 // Check if an SDValue has the toc-data attribute.
-static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) {
+static bool hasTocDataAttr(SDValue Val) {
   GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Val);
   if (!GA)
     return false;
@@ -6115,8 +6115,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {

     assert(isAIXABI && "ELF ABI already handled");

-    if (hasTocDataAttr(N->getOperand(0),
-                       CurDAG->getDataLayout().getPointerSize())) {
+    if (hasTocDataAttr(N->getOperand(0))) {
       replaceWith(PPC::ADDItoc, N, MVT::i32);
       return;
     }
@@ -6128,8 +6127,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     if (isPPC64 && CModel == CodeModel::Small) {
       assert(isAIXABI && "ELF ABI handled in common SelectCode");

-      if (hasTocDataAttr(N->getOperand(0),
-                         CurDAG->getDataLayout().getPointerSize())) {
+      if (hasTocDataAttr(N->getOperand(0))) {
         replaceWith(PPC::ADDItoc8, N, MVT::i64);
         return;
       }
@@ -6144,9 +6142,10 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
            " ELF/AIX or 32-bit AIX in the following.");

     // Transforms the ISD::TOC_ENTRY node for 32-bit AIX large code model mode
-    // or 64-bit medium (ELF-only) or large (ELF and AIX) code model code. We
-    // generate two instructions as described below. The first source operand
-    // is a symbol reference. If it must be toc-referenced according to
+    // or 64-bit medium (ELF-only) or large (ELF and AIX) code model code for
+    // non-toc-data symbols.
+    // We generate two instructions as described below. The first source
+    // operand is a symbol reference. If it must be toc-referenced according to
     // Subtarget, we generate:
     // [32-bit AIX]
     //   LWZtocL(@sym, ADDIStocHA(%r2, @sym))
     // [64-bit ELF/AIX]
     //   LDtocL(@sym, ADDIStocHA8(%x2, @sym))
     // Otherwise we generate:
     //   ADDItocL8(ADDIStocHA8(%x2, @sym), @sym)
+
+    // For large code model toc-data symbols we generate:
+    // [32-bit AIX]
+    //   ADDItocL(ADDIStocHA(%x2, @sym), @sym)
+    // [64-bit AIX]
+    //   Currently not supported.
+
     SDValue GA = N->getOperand(0);
     SDValue TOCbase = N->getOperand(1);

@@ -6161,6 +6167,19 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     SDNode *Tmp = CurDAG->getMachineNode(
         isPPC64 ? PPC::ADDIStocHA8 : PPC::ADDIStocHA, dl, VT, TOCbase, GA);

+    // On AIX, if the symbol has the toc-data attribute it will be defined
+    // in the TOC entry, so we use an ADDItocL similar to the medium code
+    // model ELF ABI.
+    if (isAIXABI && hasTocDataAttr(GA)) {
+      if (isPPC64)
+        report_fatal_error(
+            "64-bit large code model toc-data not yet supported");
+
+      ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, VT,
+                                            SDValue(Tmp, 0), GA));
+      return;
+    }
+
     if (PPCLowering->isAccessedAsGotIndirect(GA)) {
       // If it is accessed as got-indirect, we need an extra LWZ/LD to load
       // the address.
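(Aside, not part of the patch: the TEST32LARGE checks in the test diff below expect an addis/la pair against @u/@l relocations. The following is a self-contained sketch of the usual high-adjusted/low split such a pair computes; the function name and constants are invented for illustration, and whether the AIX @u relocation applies exactly this adjustment is an assumption here, not something the patch states.)

    #include <cassert>
    #include <cstdint>

    // Hypothetical helper: compute tocBase + off when the ISA only offers
    // 16-bit displacements. 'la' (addi) sign-extends its displacement, so
    // the high half must be adjusted to compensate for a negative low half.
    uint32_t materializeLargeOffset(uint32_t tocBase, int32_t off) {
      int16_t lo = static_cast<int16_t>(off);   // low 16 bits (the @l part)
      int32_t ha = (off - lo) / 65536;          // adjusted high half; exact
      uint32_t r = tocBase + (static_cast<uint32_t>(ha) << 16);   // addis
      r += static_cast<uint32_t>(static_cast<int32_t>(lo));       // la
      assert(r == tocBase + static_cast<uint32_t>(off) &&
             "high/low split must recompose the full offset");
      return r;
    }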
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 93874d65531aed..b32f178ca38e65 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1090,6 +1090,7 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable( case PPC::LIS8: case PPC::ADDIStocHA: case PPC::ADDIStocHA8: + case PPC::ADDItocL: case PPC::ADDItocL8: case PPC::LOAD_STACK_GUARD: case PPC::PPCLdFixedAddr: diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 6423e692d88c37..43e3902cf40746 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -3346,11 +3346,13 @@ def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentr "#ADDIStocHA", [(set i32:$rD, (PPCtoc_entry i32:$reg, tglobaladdr:$disp))]>; -// Local Data Transform +// TOC Data Transform AIX def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg), "#ADDItoc", [(set i32:$rD, (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>; +def ADDItocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp), + "#ADDItocL", []>; // Get Global (GOT) Base Register offset, from the word immediately preceding // the function label. diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def index fb6e656edb8b9b..1a61ae23e5d72d 100644 --- a/llvm/lib/Target/PowerPC/PPCMacroFusion.def +++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def @@ -32,7 +32,7 @@ // {addi} followed by one of these {lxvd2x, lxvw4x, lxvdsx, lvebx, lvehx, // lvewx, lvx, lxsdx} FUSION_FEATURE(AddiLoad, hasAddiLoadFusion, 2, \ - FUSION_OP_SET(ADDI, ADDI8, ADDItocL8), \ + FUSION_OP_SET(ADDI, ADDI8, ADDItocL, ADDItocL8), \ FUSION_OP_SET(LXVD2X, LXVW4X, LXVDSX, LVEBX, LVEHX, LVEWX, \ LVX, LXSDX)) @@ -134,13 +134,13 @@ FUSION_FEATURE(XorisXori, hasWideImmFusion, 1, FUSION_OP_SET(XORIS, XORIS8), // addis rx,ra,si - addi rt,rx,SI, SI >= 0 FUSION_FEATURE(AddisAddi, hasWideImmFusion, 1, - FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8), - FUSION_OP_SET(ADDI, ADDI8, ADDItocL8)) + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8, ADDIStocHA), + FUSION_OP_SET(ADDI, ADDI8, ADDItocL8, ADDItocL)) // addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 FUSION_FEATURE(AddiAddis, hasWideImmFusion, 1, - FUSION_OP_SET(ADDI, ADDI8, ADDItocL8), - FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8)) + FUSION_OP_SET(ADDI, ADDI8, ADDItocL8, ADDItocL), + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8, ADDIStocHA)) // mtctr - { bcctr,bcctrl } FUSION_FEATURE(ZeroMoveCTR, hasZeroMoveFusion, -1, diff --git a/llvm/test/CodeGen/PowerPC/toc-data.ll b/llvm/test/CodeGen/PowerPC/toc-data.ll index cbf3be9fcaad05..7f7afe76cfcdeb 100644 --- a/llvm/test/CodeGen/PowerPC/toc-data.ll +++ b/llvm/test/CodeGen/PowerPC/toc-data.ll @@ -12,6 +12,14 @@ ; RUN: llc -mtriple powerpc-ibm-aix-xcoff -verify-machineinstrs -O0 < %s | FileCheck %s --check-prefix TEST32 ; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -verify-machineinstrs -O0 < %s | FileCheck %s --check-prefix TEST64 +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s \ +; RUN: -stop-before=ppc-vsx-copy | FileCheck %s --check-prefix CHECK32LARGE +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefix TEST32LARGE + +; Global variables i and f have the toc-data attribute. 
+; In the following functions, those writing to or reading from +; variables i and f should use the toc-data access pattern. +; All remaining variables should use the regular toc access sequence. @i = dso_local global i32 0, align 4 #0 @d = dso_local local_unnamed_addr global double 3.141590e+00, align 8 @f = dso_local local_unnamed_addr global float 0x4005BE76C0000000, align 4 #0 @@ -44,6 +52,16 @@ define dso_local void @write_int(i32 signext %in) { ; TEST64: la 4, i[TD](2) ; TEST64-NEXT: stw 3, 0(4) +; CHECK32LARGE: name: write_int +; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @i +; CHECK32LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:gprc_and_gprc_nor0 = ADDItocL killed %[[SCRATCH1]], @i +; CHECK32LARGE-NEXT: STW %{{[0-9]+}}, 0, killed %[[SCRATCH2]] :: (store (s32) into @i) + +; FIXME: peephole optimization opportunity for lower part relocation @l to the consuming stw +; TEST32LARGE: .write_int: +; TEST32LARGE: addis 4, i[TD]@u(2) +; TEST32LARGE-NEXT: la 4, i[TD]@l(4) +; TEST32LARGE-NEXT: stw 3, 0(4) define dso_local i64 @read_ll() { entry: @@ -70,6 +88,15 @@ define dso_local i64 @read_ll() { ; TEST64: ld 3, L..C0(2) ; TEST64-NEXT: ld 3, 0(3) +; CHECK32LARGE: name: read_ll +; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @ll +; CHECK32LARGE: LWZtocL @ll, killed %[[SCRATCH1]] :: (load (s32) from got) + +; TEST32LARGE: .read_ll: +; TEST32LARGE: addis 3, L..C0@u(2) +; TEST32LARGE-NEXT: lwz 4, L..C0@l(3) +; TEST32LARGE-NEXT: lwz 3, 0(4) +; TEST32LARGE-NEXT: lwz 4, 4(4) define dso_local float @read_float() { entry: @@ -96,6 +123,16 @@ define dso_local float @read_float() { ; TEST64: la 3, f[TD](2) ; TEST64-NEXT: lfs 1, 0(3) +; CHECK32LARGE: name: read_float +; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @f +; CHECK32LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:gprc_and_gprc_nor0 = ADDItocL killed %[[SCRATCH1]], @f +; CHECK32LARGE-NEXT: LFS 0, killed %[[SCRATCH2]] :: (dereferenceable load (s32) from @f) + +; FIXME: peephole optimization opportunity for lower part relocation @l to the consuming lfs +; TEST32LARGE: .read_float: +; TEST32LARGE: addis 3, f[TD]@u(2) +; TEST32LARGE-NEXT: la 3, f[TD]@l(3) +; TEST32LARGE-NEXT: lfs 1, 0(3) define dso_local void @write_double(double %in) { entry: @@ -121,6 +158,14 @@ define dso_local void @write_double(double %in) { ; TEST64: ld 3, L..C1(2) ; TEST64-NEXT: stfd 1, 0(3) +; CHECK32LARGE: name: write_double +; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @d +; CHECK32LARGE: LWZtocL @d, killed %[[SCRATCH1]] :: (load (s32) from got) + +; TEST32LARGE: .write_double: +; TEST32LARGE: addis 3, L..C1@u(2) +; TEST32LARGE-NEXT: lwz 3, L..C1@l(3) +; TEST32LARGE-NEXT: stfd 1, 0(3) define dso_local nonnull ptr @addr() { entry: @@ -144,6 +189,15 @@ define dso_local nonnull ptr @addr() { ; TEST64: .addr ; TEST64: la 3, i[TD](2) +; CHECK32LARGE: name: addr +; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @i +; CHECK32LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:gprc = ADDItocL killed %[[SCRATCH1]], @i +; CHECK32LARGE-NEXT: $r3 = COPY %[[SCRATCH2]] + +; TEST32LARGE: .addr: +; TEST32LARGE: addis 3, i[TD]@u(2) +; TEST32LARGE-NEXT: la 3, i[TD]@l(3) + ; TEST32: .toc ; TEST32: .tc ll[TC],ll[RW] ; TEST32-NOT: .csect ll[TD] @@ -170,4 +224,17 @@ define dso_local nonnull ptr @addr() { ; TEST64-NEXT: .globl f[TD] ; TEST64-NOT: .tc f[TD],f[RW] +; TEST32LARGE: .toc +; TEST32LARGE: .tc ll[TE],ll[RW] +; TEST32LARGE-NOT: .csect ll[TD] +; TEST32LARGE: .tc d[TE],d[RW] +; TEST32LARGE-NOT: 
.csect d[TD],2 +; TEST32LARGE: .csect i[TD],2 +; TEST32LARGE-NEXT: .globl i[TD] +; TEST32LARGE-NEXT: .align 2 +; TEST32LARGE-NOT: .tc i[TE],i[RW] +; TEST32LARGE: .csect f[TD],2 +; TEST32LARGE-NEXT: .globl f[TD] +; TEST32LARGE-NOT: .tc f[TE],f[RW] + attributes #0 = { "toc-data" } From edbeae373489a2e710f328ceba50b4740c738217 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 17 Apr 2024 21:12:08 +0800 Subject: [PATCH 255/300] [RISCV] Explicitly bail if something modifies VL/VTYPE in doLocalPostpass If an instruction between MI and NextMI uses VL or VTYPE we demand the respective fields so as to not clobber them at their uses. But we don't consider if something might modify VL or VTYPE, and will happily coalesce two vsetvlis when we need to preserve them. This fixes this by skipping to the next vsetvli. Demanding the fields isn't enough, as we need to preserve the VL and VTYPE values even if no fields are demanded. In practice this doesn't happen, presumably due to there not being any instructions that write to VL or VTYPE without reading them. But I noticed this whilst working on a separate patch and split it out. --- llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 3 +++ llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index aab91adbb64be4..fa37d1ccccd737 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -1538,6 +1538,9 @@ void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) { if (!isVectorConfigInstr(MI)) { doUnion(Used, getDemanded(MI, MRI, ST)); + if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) || + MI.modifiesRegister(RISCV::VTYPE)) + NextMI = nullptr; continue; } diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir index d9a87c2cb12a86..e8620c848f8d3d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir @@ -516,9 +516,10 @@ body: | ; CHECK-LABEL: name: postpass_modify_vl ; CHECK: liveins: $x1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype + ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $vtype ; CHECK-NEXT: $vl = COPY $x1 + ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: PseudoRET dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype From 38205717501237f2b7a57eaabe65a8367e5f91c3 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 16 Apr 2024 14:57:27 -0400 Subject: [PATCH 256/300] [C99] Remove WG14 N522 from the C status page This paper is about type compatibility rules that changed in C99, but this is only applicable across translation units and so there's nothing for us to test. The specific change was that C89 allowed different tag types (e.g., struct and union) to be compatible and C99 tightened that restriction. This is a case where the user gets whatever they get if they link two TUs with incompatible tag types. 
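To make the rule concrete, here is a hypothetical pair of translation units of the kind the paper covers, written as C-style code that is also valid C++ (illustrative only; as the message says, nothing in a single-TU test suite can exercise this):

    // tu1.c: the object is defined with a struct tag.
    struct Pair { int a; int b; };
    struct Pair g = {1, 2};

    // tu2.c: the same object is declared with a union tag. Per the commit
    // message, C89 could still treat such cross-TU declarations as
    // compatible, while C99 tightened the restriction; a program linking
    // these two TUs simply gets whatever behavior falls out.
    union Pair { int a; int b; };
    extern union Pair g;
    int use(void) { return g.a; }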
--- clang/www/c_status.html | 5 ----- 1 file changed, 5 deletions(-) diff --git a/clang/www/c_status.html b/clang/www/c_status.html index 9893170ae84739..dfc1afefda245f 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -295,11 +295,6 @@
[Table rows from c_status.html; the HTML markup was lost in extraction. The
hunk sits under the "C99 implementation status" heading: the surrounding
context rows read "N570 / Yes" and "additional predefined macro names /
Unknown", and the five deleted lines remove the row "new structure type
compatibility (tag compatibility) / N522 / Unknown".]

From 41b7341d6b27adf81262a5a0bd4e430675b73bbb Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 17 Apr 2024 14:56:46 +0100
Subject: [PATCH 257/300] [VPlan] Factor out helper to recursively collect all
 users (NFCI).

Factor out logic to collect all users recursively to be re-used in
https://github.com/llvm/llvm-project/pull/87816.
---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  5 ++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 30 +++++++++----------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 148227f1f1a57b..334b10e2e5d097 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1049,6 +1049,11 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
            R->getVPDefID() == VPRecipeBase::VPVectorPointerSC;
   }

+  static inline bool classof(const VPUser *U) {
+    auto *R = dyn_cast<VPRecipeBase>(U);
+    return R && classof(R);
+  }
+
   /// Drop all poison-generating flags.
   void dropPoisonGeneratingFlags() {
     // NOTE: This needs to be kept in-sync with

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 382bf5ac114053..78d0b5b95c5ec7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -852,6 +852,18 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
   return true;
 }

+static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
+  SetVector<VPUser *> Users(V->user_begin(), V->user_end());
+  for (unsigned I = 0; I != Users.size(); ++I) {
+    VPRecipeBase *Cur = dyn_cast<VPRecipeBase>(Users[I]);
+    if (!Cur || isa<VPHeaderPHIRecipe>(Cur))
+      continue;
+    for (VPValue *V : Cur->definedValues())
+      Users.insert(V->user_begin(), V->user_end());
+  }
+  return Users.takeVector();
+}
+
 void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
   for (VPRecipeBase &R :
        Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
@@ -863,24 +875,10 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
     if (RK != RecurKind::Add && RK != RecurKind::Mul)
       continue;

-    SmallSetVector<VPValue *, 8> Worklist;
-    Worklist.insert(PhiR);
-
-    for (unsigned I = 0; I != Worklist.size(); ++I) {
-      VPValue *Cur = Worklist[I];
-      if (auto *RecWithFlags =
-              dyn_cast<VPRecipeWithIRFlags>(Cur->getDefiningRecipe())) {
+    for (VPUser *U : collectUsersRecursively(PhiR))
+      if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
         RecWithFlags->dropPoisonGeneratingFlags();
       }
-
-      for (VPUser *U : Cur->users()) {
-        auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
-        if (!UserRecipe)
-          continue;
-        for (VPValue *V : UserRecipe->definedValues())
-          Worklist.insert(V);
-      }
-    }
   }
 }

From 856d1c44103f09f2ed0448001de9dcda63055733 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Wed, 17 Apr 2024 14:58:13 +0100
Subject: [PATCH 258/300] [AMDGPU] Fix predicates for BUFFER_ATOMIC_FMIN/FMAX
 patterns (#89066)

Use OtherPredicates to avoid interfering with other uses of
SubtargetPredicate for GFX12.
--- llvm/lib/Target/AMDGPU/BUFInstructions.td | 2 +- .../AMDGPU/fp-min-max-buffer-atomics.ll | 72 +++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 273f92abf35465..8053d89aeb0a89 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1726,7 +1726,7 @@ let SubtargetPredicate = isGFX12Plus in { defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>; } -let SubtargetPredicate = isGFX6GFX7GFX10Plus in { +let OtherPredicates = [isGFX6GFX7GFX10Plus] in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">; } diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll index 0c62b52eb92afc..587340c7aa342c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -4,12 +4,14 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1100 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX7 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1100 +; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12 declare float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float, <4 x i32>, i32, i32, i32 immarg) declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32, i32 immarg) @@ -70,6 +72,18 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_min_noret_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd @@ -170,6 +184,15 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, floa ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], 
null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[0:1], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc @@ -292,6 +315,20 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd @@ -427,6 +464,18 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_max_noret_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd @@ -527,6 +576,15 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, floa ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[0:1], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc @@ -641,6 +699,20 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[6:7] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 From 
4f88c2311130791cf69da34b743b1b3ba7584a7b Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Wed, 17 Apr 2024 15:59:18 +0200 Subject: [PATCH 259/300] [mlir][py] Add NVGPU's `TensorMapDescriptorType` in py bindings (#88855) This PR adds NVGPU dialects' TensorMapDescriptorType in the py bindings. This is a follow-up issue from [this PR](https://github.com/llvm/llvm-project/pull/87153#discussion_r1546193095) --- mlir/include/mlir-c/Dialect/NVGPU.h | 11 ++++++ mlir/lib/Bindings/Python/DialectNVGPU.cpp | 41 +++++++++++++++++++++++ mlir/lib/CAPI/Dialect/NVGPU.cpp | 18 ++++++++++ mlir/python/CMakeLists.txt | 13 +++++++ mlir/python/mlir/dialects/nvgpu.py | 1 + mlir/test/python/dialects/nvgpu.py | 17 ++++++++++ 6 files changed, 101 insertions(+) create mode 100644 mlir/lib/Bindings/Python/DialectNVGPU.cpp diff --git a/mlir/include/mlir-c/Dialect/NVGPU.h b/mlir/include/mlir-c/Dialect/NVGPU.h index 580d566794c09f..e58015a4a3421a 100644 --- a/mlir/include/mlir-c/Dialect/NVGPU.h +++ b/mlir/include/mlir-c/Dialect/NVGPU.h @@ -11,6 +11,7 @@ #define MLIR_C_DIALECT_NVGPU_H #include "mlir-c/IR.h" +#include "mlir-c/Support.h" #ifdef __cplusplus extern "C" { @@ -18,6 +19,16 @@ extern "C" { MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(NVGPU, nvgpu); +//===---------------------------------------------------------------------===// +// TensorMapDescriptorType +//===---------------------------------------------------------------------===// + +MLIR_CAPI_EXPORTED bool mlirTypeIsANVGPUTensorMapDescriptorType(MlirType type); + +MLIR_CAPI_EXPORTED MlirType mlirNVGPUTensorMapDescriptorTypeGet( + MlirContext ctx, MlirType tensorMemrefType, int swizzle, int l2promo, + int oobFill, int interleave); + #ifdef __cplusplus } #endif diff --git a/mlir/lib/Bindings/Python/DialectNVGPU.cpp b/mlir/lib/Bindings/Python/DialectNVGPU.cpp new file mode 100644 index 00000000000000..341e4d55bcf219 --- /dev/null +++ b/mlir/lib/Bindings/Python/DialectNVGPU.cpp @@ -0,0 +1,41 @@ +//===--- DialectNvgpu.cpp - Pybind module for Nvgpu dialect API support ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir-c/Dialect/NVGPU.h" +#include "mlir-c/IR.h" +#include "mlir/Bindings/Python/PybindAdaptors.h" +#include + +namespace py = pybind11; +using namespace llvm; +using namespace mlir; +using namespace mlir::python; +using namespace mlir::python::adaptors; + +static void populateDialectNvgpuSubmodule(const pybind11::module &m) { + auto nvgpuTensorMapDescriptorType = mlir_type_subclass( + m, "TensorMapDescriptorType", mlirTypeIsANVGPUTensorMapDescriptorType); + + nvgpuTensorMapDescriptorType.def_classmethod( + "get", + [](py::object cls, MlirType tensorMemrefType, int swizzle, int l2promo, + int oobFill, int interleave, MlirContext ctx) { + return cls(mlirNVGPUTensorMapDescriptorTypeGet( + ctx, tensorMemrefType, swizzle, l2promo, oobFill, interleave)); + }, + "Gets an instance of TensorMapDescriptorType in the same context", + py::arg("cls"), py::arg("tensor_type"), py::arg("swizzle"), + py::arg("l2promo"), py::arg("oob_fill"), py::arg("interleave"), + py::arg("ctx") = py::none()); +} + +PYBIND11_MODULE(_mlirDialectsNvgpu, m) { + m.doc() = "MLIR NVGPU dialect."; + + populateDialectNvgpuSubmodule(m); +} diff --git a/mlir/lib/CAPI/Dialect/NVGPU.cpp b/mlir/lib/CAPI/Dialect/NVGPU.cpp index 02d10954a03776..e6da529e1b6b5f 100644 --- a/mlir/lib/CAPI/Dialect/NVGPU.cpp +++ b/mlir/lib/CAPI/Dialect/NVGPU.cpp @@ -9,5 +9,23 @@ #include "mlir-c/Dialect/NVGPU.h" #include "mlir/CAPI/Registration.h" #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" +#include "mlir/IR/BuiltinTypes.h" + +using namespace mlir; +using namespace mlir::nvgpu; MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(NVGPU, nvgpu, mlir::nvgpu::NVGPUDialect) + +bool mlirTypeIsANVGPUTensorMapDescriptorType(MlirType type) { + return isa(unwrap(type)); +} + +MlirType mlirNVGPUTensorMapDescriptorTypeGet(MlirContext ctx, + MlirType tensorMemrefType, + int swizzle, int l2promo, + int oobFill, int interleave) { + return wrap(nvgpu::TensorMapDescriptorType::get( + unwrap(ctx), cast(unwrap(tensorMemrefType)), + TensorMapSwizzleKind(swizzle), TensorMapL2PromoKind(l2promo), + TensorMapOOBKind(oobFill), TensorMapInterleaveKind(interleave))); +} diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index c27ee688a04087..0a2dc0754c09d0 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -524,6 +524,19 @@ declare_mlir_python_extension(MLIRPythonExtension.Dialects.Quant.Pybind MLIRCAPIQuant ) +declare_mlir_python_extension(MLIRPythonExtension.Dialects.NVGPU.Pybind + MODULE_NAME _mlirDialectsNvgpu + ADD_TO_PARENT MLIRPythonSources.Dialects.nvgpu + ROOT_DIR "${PYTHON_SOURCE_DIR}" + SOURCES + DialectNVGPU.cpp + PRIVATE_LINK_LIBS + LLVMSupport + EMBED_CAPI_LINK_LIBS + MLIRCAPIIR + MLIRCAPINVGPU +) + declare_mlir_python_extension(MLIRPythonExtension.Dialects.PDL.Pybind MODULE_NAME _mlirDialectsPDL ADD_TO_PARENT MLIRPythonSources.Dialects.pdl diff --git a/mlir/python/mlir/dialects/nvgpu.py b/mlir/python/mlir/dialects/nvgpu.py index 2f6993b768ca53..e19bf610ea33a6 100644 --- a/mlir/python/mlir/dialects/nvgpu.py +++ b/mlir/python/mlir/dialects/nvgpu.py @@ -4,3 +4,4 @@ from ._nvgpu_ops_gen import * from ._nvgpu_enum_gen import * +from .._mlir_libs._mlirDialectsNvgpu import * diff --git a/mlir/test/python/dialects/nvgpu.py b/mlir/test/python/dialects/nvgpu.py index 3158388f0e6869..6df32bdd3c2739 100644 --- a/mlir/test/python/dialects/nvgpu.py +++ b/mlir/test/python/dialects/nvgpu.py @@ 
-15,6 +15,23 @@ def constructAndPrintInModule(f): return f +# CHECK-LABEL: testTypes +@constructAndPrintInModule +def testTypes(): + tensorMemrefType = MemRefType.get( + (128, 64), F16Type.get(), memory_space=Attribute.parse("3") + ) + # CHECK: !nvgpu.tensormap.descriptor, swizzle = swizzle_128b, l2promo = l2promo_256b, oob = nan, interleave = none> + tma_desc = nvgpu.TensorMapDescriptorType.get( + tensorMemrefType, + nvgpu.TensorMapSwizzleKind.SWIZZLE_128B, + nvgpu.TensorMapL2PromoKind.L2PROMO_256B, + nvgpu.TensorMapOOBKind.OOB_NAN, + nvgpu.TensorMapInterleaveKind.INTERLEAVE_NONE, + ) + print(tma_desc) + + # CHECK-LABEL: testSmoke @constructAndPrintInModule def testSmoke(): From fda04b1caaf1a61b208f23e717a2db6d9b861f5a Mon Sep 17 00:00:00 2001 From: Rajveer Singh Bharadwaj Date: Wed, 17 Apr 2024 19:37:18 +0530 Subject: [PATCH 260/300] [libc] Replace mentions of `LIBC_FULLBUILD` with `LLVM_LIBC_FULL_BUILD` in 'examples/' (#88657) Resolves #88328 --- libc/examples/README.md | 2 +- libc/examples/examples.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/examples/README.md b/libc/examples/README.md index 36b886090c6c1c..1bc4a67294f2a7 100644 --- a/libc/examples/README.md +++ b/libc/examples/README.md @@ -59,7 +59,7 @@ have installed them, you have to inform CMake that we are linking against the full libc as follows: ```bash -cmake ../ -G -DLIBC_FULLBUILD=ON \ +cmake ../ -G -DLLVM_LIBC_FULL_BUILD=ON \ -DCMAKE_SYSROOT= \ -DCMAKE_C_COMPILER=/bin/clang \ -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY diff --git a/libc/examples/examples.cmake b/libc/examples/examples.cmake index 81e99e3cbede9c..6bb6b41c252f5b 100644 --- a/libc/examples/examples.cmake +++ b/libc/examples/examples.cmake @@ -4,13 +4,13 @@ function(add_example name) ${ARGN} ) - if(LIBC_FULLBUILD) + if(LLVM_LIBC_FULL_BUILD) target_link_options(${name} PRIVATE -static -rtlib=compiler-rt -fuse-ld=lld) elseif(LIBC_OVERLAY_ARCHIVE_DIR) target_link_directories(${name} PRIVATE ${LIBC_OVERLAY_ARCHIVE_DIR}) target_link_options(${name} PRIVATE -l:libllvmlibc.a) else() - message(FATAL_ERROR "Either LIBC_FULLBUILD should be on or " + message(FATAL_ERROR "Either LLVM_LIBC_FULL_BUILD should be on or " "LIBC_OVERLAY_ARCHIVE_DIR should be set.") endif() endfunction() From d558c090fc78beb6737098f058a084635b893567 Mon Sep 17 00:00:00 2001 From: yronglin Date: Wed, 17 Apr 2024 22:11:04 +0800 Subject: [PATCH 261/300] [NFC] Clean dead code in ParsedAttr.h (#89064) Signed-off-by: yronglin --- clang/include/clang/Sema/ParsedAttr.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index e3857b2f07d9e0..25a5fa05b21c7d 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -94,7 +94,7 @@ struct PropertyData { : GetterId(getterId), SetterId(setterId) {} }; -} // namespace +} // namespace detail /// Wraps an identifier and optional source location for the identifier. 
struct IdentifierLoc {
@@ -743,11 +743,6 @@ class AttributePool {
                      IdentifierInfo *scopeName, SourceLocation scopeLoc,
                      ArgsUnion *args, unsigned numArgs, ParsedAttr::Form form,
                      SourceLocation ellipsisLoc = SourceLocation()) {
-    size_t temp =
-        ParsedAttr::totalSizeToAlloc<ArgsUnion, detail::AvailabilityData,
-                                     detail::TypeTagForDatatypeData, ParsedType,
-                                     detail::PropertyData>(numArgs, 0, 0, 0, 0);
-    (void)temp;
     void *memory = allocate(
         ParsedAttr::totalSizeToAlloc<ArgsUnion, detail::AvailabilityData,
[The remainder of this hunk and the "From <commit>"/"From:" header lines of
the next patch were lost in extraction.]
Date: Wed, 17 Apr 2024 14:29:09 +0000
Subject: [PATCH 262/300] [lldb] XFAIL TestDetachResumes on windows

---
 .../API/commands/process/detach-resumes/TestDetachResumes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py b/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py
index 57727294ddc3d3..797db55a45b9b7 100644
--- a/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py
+++ b/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py
@@ -12,6 +12,7 @@ class DetachResumesTestCase(TestBase):

     NO_DEBUG_INFO_TESTCASE = True

+    @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr89077")
     def test_detach_resumes(self):
         self.build()
         exe = self.getBuildArtifact()

From e49043512dbdc68319093da46e95a1e331ef837e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 17 Apr 2024 14:55:53 +0100
Subject: [PATCH 263/300] [CostModel][X86] Update BITREVERSE costs for GFNI
 targets

Inspired by the recent patches by @shamithoke - we now have real scheduler
model numbers for GFNI instructions, allowing us to calculate an upper-bound
cost table instead of deriving it analytically.
---
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  41 +++--
 .../CostModel/X86/bitreverse-codesize.ll      |  64 +++----
 .../CostModel/X86/bitreverse-latency.ll       | 160 +++++++++---------
 .../CostModel/X86/bitreverse-sizelatency.ll   | 160 +++++++++---------
 .../test/Analysis/CostModel/X86/bitreverse.ll | 136 +++++++--------
 5 files changed, 284 insertions(+), 277 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b466624e133488..38064f97926992 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3820,6 +3820,24 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
     { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
   };
+  static const CostKindTblEntry GFNICostTbl[] = {
+    { ISD::BITREVERSE, MVT::i8,     { 3, 3, 3, 4 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::i16,    { 3, 3, 4, 6 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::i32,    { 3, 3, 4, 5 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::i64,    { 3, 3, 4, 6 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v16i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v32i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v64i8,  { 1, 6, 1, 2 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v8i16,  { 1, 8, 2, 4 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v4i32,  { 1, 8, 2, 4 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v8i32,  { 1, 9, 2, 4 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v2i64,  { 1, 8, 2, 4 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v4i64,  { 1, 9, 2, 4 } }, // gf2p8affineqb
+    { ISD::BITREVERSE, MVT::v8i64,  { 1, 9, 2, 4 } }, // gf2p8affineqb
+  };
  static const
CostKindTblEntry GLMCostTbl[] = { { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps @@ -4156,23 +4174,6 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, std::pair LT = getTypeLegalizationCost(OpTy); MVT MTy = LT.second; - // Attempt to lookup cost. - if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && - MTy.isVector()) { - // With PSHUFB the code is very similar for all types. If we have integer - // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types - // we also need a PSHUFB. - unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; - - // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB - // instructions. We also need an extract and an insert. - if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || - (ST->hasBWI() && MTy.is512BitVector()))) - Cost = Cost * 2 + 2; - - return LT.first * Cost; - } - // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. if (((ISD == ISD::CTTZ && !ST->hasBMI()) || (ISD == ISD::CTLZ && !ST->hasLZCNT())) && @@ -4230,6 +4231,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags()); + if (ST->hasGFNI()) + if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy)) + if (auto KindCost = Entry->Cost[CostKind]) + return adjustTableCost(Entry->ISD, *KindCost, LT.first, + ICA.getFlags()); + if (ST->hasCDI()) if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll b/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll index 90e49e6ccb81ce..e02ba761aa8d29 100644 --- a/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll @@ -40,23 +40,23 @@ define i64 @var_bitreverse_i64(i64 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i64 
@llvm.bitreverse.i64(i64 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) @@ -77,23 +77,23 @@ define i32 @var_bitreverse_i32(i32 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) @@ -114,23 +114,23 @@ define i16 @var_bitreverse_i16(i16 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) @@ -151,23 +151,23 @@ define i8 @var_bitreverse_i8(i8 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; %bitreverse = call i8 
@llvm.bitreverse.i8(i8 %a) @@ -270,7 +270,7 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v4i64' @@ -323,7 +323,7 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i64' @@ -331,7 +331,7 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) { ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i64' @@ -421,7 +421,7 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i32' @@ -474,7 +474,7 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i32' @@ -482,7 +482,7 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) { ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i32' @@ -572,7 +572,7 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i16' @@ -625,7 +625,7 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i16' @@ -633,7 +633,7 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) { ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i16' @@ -723,7 +723,7 @@ define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i8' @@ -776,7 +776,7 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v64i8' @@ -784,7 +784,7 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) { ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v64i8' diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll b/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll index e7d0d05f824296..ba231b985c26bf 100644 --- a/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll @@ -40,23 +40,23 @@ define i64 @var_bitreverse_i64(i64 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) @@ -77,23 +77,23 @@ define i32 @var_bitreverse_i32(i32 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i32 
@llvm.bitreverse.i32(i32 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) @@ -114,23 +114,23 @@ define i16 @var_bitreverse_i16(i16 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX512F-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) @@ -151,23 +151,23 @@ define i8 @var_bitreverse_i8(i8 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) @@ -221,23 +221,23 @@ define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v2i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call 
<2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse ; %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) @@ -270,23 +270,23 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v4i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) @@ -323,23 +323,23 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) @@ -376,23 +376,23 @@ define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v4i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> 
@llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) @@ -425,23 +425,23 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; 
GFNIAVX512BW-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) @@ -478,23 +478,23 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) @@ -531,23 +531,23 @@ define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> 
%bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) @@ -580,23 +580,23 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for 
instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) @@ -633,23 +633,23 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v32i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) @@ -686,23 +686,23 @@ define <16 x i8> @var_bitreverse_v16i8(<16 x i8> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNISSE-NEXT: Cost Model: Found 
an estimated cost of 6 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) @@ -735,23 +735,23 @@ define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v32i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX512F-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) @@ -788,23 +788,23 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v64i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll index 1ba93af46ebdca..d60fac228fc069 100644 --- a/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll +++ 
b/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll @@ -40,23 +40,23 @@ define i64 @var_bitreverse_i64(i64 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) @@ -77,23 +77,23 @@ define i32 @var_bitreverse_i32(i32 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
ret i32 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse ; %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) @@ -114,23 +114,23 @@ define i16 @var_bitreverse_i16(i16 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse ; %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) @@ -151,23 +151,23 @@ define i8 @var_bitreverse_i8(i8 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNISSE-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse ; %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) @@ -217,23 +217,23 @@ define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v2i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> 
%bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse ; %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) @@ -270,23 +270,23 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v4i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) @@ -323,23 +323,23 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX-LABEL: 
'var_bitreverse_v8i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) @@ -372,23 +372,23 @@ define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v4i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i32> 
@llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) @@ -425,23 +425,23 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) @@ -478,23 +478,23 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x 
i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) @@ -527,23 +527,23 @@ define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> 
@llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) @@ -580,23 +580,23 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) @@ -633,23 +633,23 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v32i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = 
call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) @@ -682,23 +682,23 @@ define <16 x i8> @var_bitreverse_v16i8(<16 x i8> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) @@ -735,23 +735,23 @@ define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v32i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) @@ -788,23 +788,23 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> 
%bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v64i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse.ll b/llvm/test/Analysis/CostModel/X86/bitreverse.ll index 0b76b0d527ba1a..a890147fee465a 100644 --- a/llvm/test/Analysis/CostModel/X86/bitreverse.ll +++ b/llvm/test/Analysis/CostModel/X86/bitreverse.ll @@ -40,23 +40,23 @@ define i64 @var_bitreverse_i64(i64 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: 
%bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) @@ -77,23 +77,23 @@ define i32 @var_bitreverse_i32(i32 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) @@ -114,23 +114,23 @@ define i16 @var_bitreverse_i16(i16 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i16' -; GFNISSE-NEXT: Cost Model: 
Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) @@ -151,23 +151,23 @@ define i8 @var_bitreverse_i8(i8 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 
%a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) @@ -217,23 +217,23 @@ define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v2i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) @@ -270,23 +270,23 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v4i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; ; 
GFNIAVX-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) @@ -323,23 +323,23 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i64> 
@llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) @@ -372,23 +372,23 @@ define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v4i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) @@ -425,23 +425,23 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; 
GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) @@ -478,23 +478,23 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i32> 
@llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) @@ -527,23 +527,23 @@ define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) @@ -580,23 +580,23 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> 
@llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) @@ -633,23 +633,23 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v32i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 
0 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse ; %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) @@ -743,7 +743,7 @@ define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i8' @@ -796,7 +796,7 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v64i8' @@ -804,7 +804,7 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) { ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v64i8' From 4a5ab13bf5a94ec7f0eabaf24dfe1a5ee720b860 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 17 Apr 2024 15:46:24 +0100 Subject: [PATCH 264/300] [VectorCombine] Remove single quotes from "-passes=vector-combine" These confuse the update_test_checks.py script when run by DOS cmd.exe --- llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll | 2 +- llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll | 2 +- llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll index eae08790048394..b27c026dbccf0b 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes='vector-combine' -S %s | FileCheck %s +; RUN: opt -passes=vector-combine -S %s | FileCheck %s target triple = "aarch64"
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index d96dfec849167d..a1d4ca1a740e7e 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes='vector-combine' -S %s | FileCheck %s +; RUN: opt -passes=vector-combine -S %s | FileCheck %s target triple = "aarch64"
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll index d69cb75664a8ca..a22575ccb1ca21 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes='vector-combine' -S %s | FileCheck %s +; RUN: opt -passes=vector-combine -S %s | FileCheck %s target triple = "aarch64"
From 5d314353fbec1a15cd8900f466dcdcf2af40e8c9 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 17 Apr 2024 15:53:15 +0100 Subject: [PATCH 265/300] [VPlan] Check for VPWidenLoadRecipe directly in truncateToMinBW. (NFCI). After a separate recipe has been introduced for wide loads in a9bafe91dd0, we can directly check for load recipes in the early bail-out and remove the redundant bail-out for stores. --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 78d0b5b95c5ec7..901ecd10c69d8f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -975,9 +975,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( vp_depth_first_deep(Plan.getVectorLoopRegion()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { if (!isa<VPWidenRecipe, VPWidenCastRecipe, VPReplicateRecipe, - VPWidenSelectRecipe, VPWidenMemoryRecipe>(&R)) - continue; - if (isa<VPWidenStoreRecipe>(&R)) + VPWidenSelectRecipe, VPWidenLoadRecipe>(&R)) continue; VPValue *ResultVPV = R.getVPSingleValue();
From 812963f6aa2adb5e990f273b8ce1a0eabcdefd7f Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Wed, 17 Apr 2024 17:02:07 +0200 Subject: [PATCH 266/300] [libc++][chrono] Improves date formatting. (#86127) The formatting of years has been done manually since the results of %Y outside the "typical" range may produce unexpected values. The same applies to %F, which is identical to %Y-%m-%d. None of these conversion specifiers is affected by the locale used, so it's trivial to handle this case manually. This removes several platform-specific ifdefs from the tests.
--- libcxx/include/__chrono/formatter.h | 16 ++++----- .../time.cal.ymd.nonmembers/ostream.pass.cpp | 34 ------------------- .../sys_date.ostream.pass.cpp | 34 ------------------- .../formatter.year_month_day.pass.cpp | 24 ------------- 4 files changed, 7 insertions(+), 101 deletions(-) diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h index d932a99f4b9983..f76e7b2ea0e864 100644 --- a/libcxx/include/__chrono/formatter.h +++ b/libcxx/include/__chrono/formatter.h @@ -322,15 +322,13 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs( __formatter::__format_year(__sstr, __t.tm_year + 1900); break; - case _CharT('F'): { - int __year = __t.tm_year + 1900; - if (__year < 1000) { - __formatter::__format_year(__sstr, __year); - __sstr << std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "-{:02}-{:02}"), __t.tm_mon + 1, __t.tm_mday); - } else - __facet.put( - {__sstr}, __sstr, _CharT(' '), std::addressof(__t), std::to_address(__s), std::to_address(__it + 1)); - } break; + case _CharT('F'): + // Depending on the platform's libc the range of supported years is + // limited. Instead of testing all conditions use the internal + // implementation unconditionally. + __formatter::__format_year(__sstr, __t.tm_year + 1900); + __sstr << std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "-{:02}-{:02}"), __t.tm_mon + 1, __t.tm_mday); + break; case _CharT('z'): __formatter::__format_zone_offset(__sstr, __z.__offset, false); diff --git a/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp index ffc737fcad5dd2..20ffb165558e31 100644 --- a/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp +++ b/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp @@ -13,9 +13,6 @@ // TODO FMT This test should not require std::to_chars(floating-point) // XFAIL: availability-fp_to_chars-missing -// TODO FMT Investigate Windows issues. 
-// XFAIL: msvc - // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 @@ -89,20 +86,9 @@ static void test() { TEST_EQUAL(stream_c_locale<CharT>( std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}), SV("2000-02-29")); - -#if defined(_AIX) - TEST_EQUAL(stream_c_locale<CharT>( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_c_locale<CharT>( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_c_locale<CharT>( std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), SV("32767-12-31")); -#endif // defined(_AIX) TEST_EQUAL(stream_fr_FR_locale<CharT>( std::chrono::year_month_day{std::chrono::year{-32'768}, std::chrono::month{1}, std::chrono::day{1}}), @@ -122,19 +108,9 @@ TEST_EQUAL(stream_fr_FR_locale<CharT>( std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}), SV("2000-02-29")); -#if defined(_AIX) - TEST_EQUAL(stream_fr_FR_locale<CharT>( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_fr_FR_locale<CharT>( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_fr_FR_locale<CharT>( std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), SV("32767-12-31")); -#endif // defined(_AIX) TEST_EQUAL(stream_ja_JP_locale<CharT>( std::chrono::year_month_day{std::chrono::year{-32'768}, std::chrono::month{1}, std::chrono::day{1}}), @@ -154,19 +130,9 @@ TEST_EQUAL(stream_ja_JP_locale<CharT>( std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}), SV("2000-02-29")); -#if defined(_AIX) - TEST_EQUAL(stream_ja_JP_locale<CharT>( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_ja_JP_locale<CharT>( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_ja_JP_locale<CharT>( std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), SV("32767-12-31")); -#endif // defined(_AIX) } int main(int, char**) {
diff --git a/libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp index 7af3ebf7768072..c5645884cfed01 100644 --- a/libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp +++ b/libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp @@ -13,9 +13,6 @@ // TODO FMT This test should not require std::to_chars(floating-point) // XFAIL: availability-fp_to_chars-missing -// TODO FMT Investigate Windows issues. 
-// XFAIL: msvc - // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 @@ -81,20 +78,9 @@ static void test() { TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}}), SV("2000-02-29")); - -#if defined(_AIX) - TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), SV("32767-12-31")); -#endif // defined(_AIX) // multiples of days are considered days. TEST_EQUAL(stream_c_locale(std::chrono::sys_time{std::chrono::weeks{3}}), @@ -112,19 +98,9 @@ static void test() { TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}}), SV("2000-02-29")); -#if defined(_AIX) - TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), SV("32767-12-31")); -#endif // defined(_AIX) // multiples of days are considered days. TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_time{std::chrono::weeks{3}}), @@ -142,19 +118,9 @@ static void test() { TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}}), SV("2000-02-29")); -#if defined(_AIX) - TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), SV("32767-12-31")); -#endif // defined(_AIX) // multiples of days are considered days. 
TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_time{std::chrono::weeks{3}}), diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp index 5a2b7afa37a865..1f2af1cb0530de 100644 --- a/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp @@ -62,17 +62,6 @@ static void test_no_chrono_specs() { std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{2}, std::chrono::day{31}}); // Valid year, invalid month, valid day -#ifdef _WIN32 - check(SV(" is not a valid date"), - SV("{}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{31}}); - check(SV("****** is not a valid date******"), - SV("{:*^32}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{31}}); - check(SV("*********** is not a valid date"), - SV("{:*>31}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{31}}); -#else // _WIN32 check(SV("1970-00-31 is not a valid date"), SV("{}"), std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{31}}); @@ -82,20 +71,8 @@ check(SV("*1970-00-31 is not a valid date"), SV("{:*>31}"), std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{31}}); -#endif // _WIN32 // Valid year, invalid month, invalid day -#ifdef _WIN32 - check(SV(" is not a valid date"), - SV("{}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{32}}); - check(SV("****** is not a valid date******"), - SV("{:*^32}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{32}}); - check(SV("*********** is not a valid date"), - SV("{:*>31}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{32}}); -#else // _WIN32 check(SV("1970-00-32 is not a valid date"), SV("{}"), std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{32}}); @@ -105,7 +82,6 @@ check(SV("*1970-00-32 is not a valid date"), SV("{:*>31}"), std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{32}}); -#endif // _WIN32 // Invalid year, valid month, valid day check(SV("-32768-01-31 is not a valid date"), From 458328ae23d318a5055d5bac66426b8551bce01f Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Wed, 17 Apr 2024 18:01:55 +0300 Subject: [PATCH 267/300] [clang][NFC] Refactor `Sema::RedeclarationKind` This patch converts the enum into a scoped enum and moves it into its own header for the time being. Its definition is needed in `Sema.h`, and is going to be needed in the upcoming `SemaObjC.h`. `Lookup.h` can't hold it, because it includes `Sema.h`.
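The mechanical shape of this change is the usual unscoped-to-scoped enum migration. A minimal, self-contained sketch of the pattern follows; the `lookup` signature is illustrative rather than the actual clang API, and only the enumerator names and their meanings are taken from this patch:

```cpp
// Before: an unscoped enum nested inside a large class. Spelling an
// enumerator such as Sema::ForVisibleRedeclaration requires the complete
// Sema definition, which forces headers like Lookup.h to include Sema.h.
//
// After: a scoped enum in a small standalone header. Callers qualify the
// enumerators and no longer depend on the enclosing class at all.
enum class RedeclarationKind {
  NotForRedeclaration = 0,  // a plain reference, not a redeclaration
  ForVisibleRedeclaration,  // redeclares an entity that is visible
  ForExternalRedeclaration  // redeclares a name with external linkage
};

// Illustrative consumer: default arguments now spell the scoped name.
void lookup(RedeclarationKind Redecl = RedeclarationKind::NotForRedeclaration);
```

This is also why an NFC change touches so many files below: every call site that previously wrote `Sema::ForVisibleRedeclaration` now has to write `RedeclarationKind::ForVisibleRedeclaration`.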
--- clang/include/clang/Sema/Lookup.h | 43 ++++++++++--------- clang/include/clang/Sema/Redeclaration.h | 31 +++++++++++++ clang/include/clang/Sema/Sema.h | 40 ++++------------- clang/lib/Interpreter/Interpreter.cpp | 3 +- clang/lib/Interpreter/InterpreterUtils.cpp | 2 +- clang/lib/Sema/SemaDecl.cpp | 24 ++++++----- clang/lib/Sema/SemaDeclCXX.cpp | 28 ++++++------ clang/lib/Sema/SemaExprCXX.cpp | 2 +- clang/lib/Sema/SemaExprMember.cpp | 2 +- clang/lib/Sema/SemaLookup.cpp | 16 ++++++- clang/lib/Sema/SemaOpenMP.cpp | 2 +- clang/lib/Sema/SemaTemplate.cpp | 5 ++- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 8 ++-- 13 files changed, 118 insertions(+), 88 deletions(-) create mode 100644 clang/include/clang/Sema/Redeclaration.h diff --git a/clang/include/clang/Sema/Lookup.h b/clang/include/clang/Sema/Lookup.h index 2f2f2607a937fe..0db5b847038ffd 100644 --- a/clang/include/clang/Sema/Lookup.h +++ b/clang/include/clang/Sema/Lookup.h @@ -153,28 +153,30 @@ class LookupResult { using iterator = UnresolvedSetImpl::iterator; - LookupResult(Sema &SemaRef, const DeclarationNameInfo &NameInfo, - Sema::LookupNameKind LookupKind, - Sema::RedeclarationKind Redecl = Sema::NotForRedeclaration) + LookupResult( + Sema &SemaRef, const DeclarationNameInfo &NameInfo, + Sema::LookupNameKind LookupKind, + RedeclarationKind Redecl = RedeclarationKind::NotForRedeclaration) : SemaPtr(&SemaRef), NameInfo(NameInfo), LookupKind(LookupKind), - Redecl(Redecl != Sema::NotForRedeclaration), - ExternalRedecl(Redecl == Sema::ForExternalRedeclaration), - DiagnoseAccess(Redecl == Sema::NotForRedeclaration), - DiagnoseAmbiguous(Redecl == Sema::NotForRedeclaration) { + Redecl(Redecl != RedeclarationKind::NotForRedeclaration), + ExternalRedecl(Redecl == RedeclarationKind::ForExternalRedeclaration), + DiagnoseAccess(Redecl == RedeclarationKind::NotForRedeclaration), + DiagnoseAmbiguous(Redecl == RedeclarationKind::NotForRedeclaration) { configure(); } // TODO: consider whether this constructor should be restricted to take // as input a const IdentifierInfo* (instead of Name), // forcing other cases towards the constructor taking a DNInfo. - LookupResult(Sema &SemaRef, DeclarationName Name, SourceLocation NameLoc, - Sema::LookupNameKind LookupKind, - Sema::RedeclarationKind Redecl = Sema::NotForRedeclaration) + LookupResult( + Sema &SemaRef, DeclarationName Name, SourceLocation NameLoc, + Sema::LookupNameKind LookupKind, + RedeclarationKind Redecl = RedeclarationKind::NotForRedeclaration) : SemaPtr(&SemaRef), NameInfo(Name, NameLoc), LookupKind(LookupKind), - Redecl(Redecl != Sema::NotForRedeclaration), - ExternalRedecl(Redecl == Sema::ForExternalRedeclaration), - DiagnoseAccess(Redecl == Sema::NotForRedeclaration), - DiagnoseAmbiguous(Redecl == Sema::NotForRedeclaration) { + Redecl(Redecl != RedeclarationKind::NotForRedeclaration), + ExternalRedecl(Redecl == RedeclarationKind::ForExternalRedeclaration), + DiagnoseAccess(Redecl == RedeclarationKind::NotForRedeclaration), + DiagnoseAmbiguous(Redecl == RedeclarationKind::NotForRedeclaration) { configure(); } @@ -285,9 +287,10 @@ class LookupResult { return ExternalRedecl; } - Sema::RedeclarationKind redeclarationKind() const { - return ExternalRedecl ? Sema::ForExternalRedeclaration : - Redecl ? Sema::ForVisibleRedeclaration : Sema::NotForRedeclaration; + RedeclarationKind redeclarationKind() const { + return ExternalRedecl ? RedeclarationKind::ForExternalRedeclaration + : Redecl ? 
RedeclarationKind::ForVisibleRedeclaration + : RedeclarationKind::NotForRedeclaration; } /// Specify whether hidden declarations are visible, e.g., @@ -615,9 +618,9 @@ class LookupResult { } /// Change this lookup's redeclaration kind. - void setRedeclarationKind(Sema::RedeclarationKind RK) { - Redecl = (RK != Sema::NotForRedeclaration); - ExternalRedecl = (RK == Sema::ForExternalRedeclaration); + void setRedeclarationKind(RedeclarationKind RK) { + Redecl = (RK != RedeclarationKind::NotForRedeclaration); + ExternalRedecl = (RK == RedeclarationKind::ForExternalRedeclaration); configure(); } diff --git a/clang/include/clang/Sema/Redeclaration.h b/clang/include/clang/Sema/Redeclaration.h new file mode 100644 index 00000000000000..ae18b922f5cd90 --- /dev/null +++ b/clang/include/clang/Sema/Redeclaration.h @@ -0,0 +1,31 @@ +//===- Redeclaration.h - Redeclarations--------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines RedeclarationKind enum. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_SEMA_REDECLARATION_H +#define LLVM_CLANG_SEMA_REDECLARATION_H + +/// Specifies whether (or how) name lookup is being performed for a +/// redeclaration (vs. a reference). +enum class RedeclarationKind { + /// The lookup is a reference to this name that is not for the + /// purpose of redeclaring the name. + NotForRedeclaration = 0, + /// The lookup results will be used for redeclaration of a name, + /// if an entity by that name already exists and is visible. + ForVisibleRedeclaration, + /// The lookup results will be used for redeclaration of a name + /// with external linkage; non-visible lookup results with external linkage + /// may also be found. + ForExternalRedeclaration +}; + +#endif // LLVM_CLANG_SEMA_REDECLARATION_H \ No newline at end of file diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 281e3b91de1d0c..1e89dfc58d92b1 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -52,6 +52,7 @@ #include "clang/Sema/IdentifierResolver.h" #include "clang/Sema/ObjCMethodList.h" #include "clang/Sema/Ownership.h" +#include "clang/Sema/Redeclaration.h" #include "clang/Sema/Scope.h" #include "clang/Sema/SemaBase.h" #include "clang/Sema/SemaConcept.h" @@ -7443,40 +7444,17 @@ class Sema final : public SemaBase { typedef std::function TypoRecoveryCallback; - /// Specifies whether (or how) name lookup is being performed for a - /// redeclaration (vs. a reference). - enum RedeclarationKind { - /// The lookup is a reference to this name that is not for the - /// purpose of redeclaring the name. - NotForRedeclaration = 0, - /// The lookup results will be used for redeclaration of a name, - /// if an entity by that name already exists and is visible. - ForVisibleRedeclaration, - /// The lookup results will be used for redeclaration of a name - /// with external linkage; non-visible lookup results with external linkage - /// may also be found. - ForExternalRedeclaration - }; - - RedeclarationKind forRedeclarationInCurContext() const { - // A declaration with an owning module for linkage can never link against - // anything that is not visible. 
We don't need to check linkage here; if - // the context has internal linkage, redeclaration lookup won't find things - // from other TUs, and we can't safely compute linkage yet in general. - if (cast(CurContext) - ->getOwningModuleForLinkage(/*IgnoreLinkage*/ true)) - return ForVisibleRedeclaration; - return ForExternalRedeclaration; - } + RedeclarationKind forRedeclarationInCurContext() const; /// Look up a name, looking for a single declaration. Return /// null if the results were absent, ambiguous, or overloaded. /// /// It is preferable to use the elaborated form and explicitly handle /// ambiguity and overloaded. - NamedDecl *LookupSingleName(Scope *S, DeclarationName Name, - SourceLocation Loc, LookupNameKind NameKind, - RedeclarationKind Redecl = NotForRedeclaration); + NamedDecl *LookupSingleName( + Scope *S, DeclarationName Name, SourceLocation Loc, + LookupNameKind NameKind, + RedeclarationKind Redecl = RedeclarationKind::NotForRedeclaration); bool LookupBuiltin(LookupResult &R); void LookupNecessaryTypesForBuiltin(Scope *S, unsigned ID); bool LookupName(LookupResult &R, Scope *S, bool AllowBuiltinCreation = false, @@ -7488,9 +7466,9 @@ class Sema final : public SemaBase { bool LookupParsedName(LookupResult &R, Scope *S, CXXScopeSpec *SS, bool AllowBuiltinCreation = false, bool EnteringContext = false); - ObjCProtocolDecl * - LookupProtocol(IdentifierInfo *II, SourceLocation IdLoc, - RedeclarationKind Redecl = NotForRedeclaration); + ObjCProtocolDecl *LookupProtocol( + IdentifierInfo *II, SourceLocation IdLoc, + RedeclarationKind Redecl = RedeclarationKind::NotForRedeclaration); bool LookupInSuper(LookupResult &R, CXXRecordDecl *Class); void LookupOverloadedOperatorName(OverloadedOperatorKind Op, Scope *S, diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index cf31456b6950ac..b20e6efcebfd10 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -550,7 +550,8 @@ std::unique_ptr Interpreter::FindRuntimeInterface() { auto LookupInterface = [&](Expr *&Interface, llvm::StringRef Name) { LookupResult R(S, &Ctx.Idents.get(Name), SourceLocation(), - Sema::LookupOrdinaryName, Sema::ForVisibleRedeclaration); + Sema::LookupOrdinaryName, + RedeclarationKind::ForVisibleRedeclaration); S.LookupQualifiedName(R, Ctx.getTranslationUnitDecl()); if (R.empty()) return false; diff --git a/clang/lib/Interpreter/InterpreterUtils.cpp b/clang/lib/Interpreter/InterpreterUtils.cpp index c19cf6aa3156c9..45f6322b8461ed 100644 --- a/clang/lib/Interpreter/InterpreterUtils.cpp +++ b/clang/lib/Interpreter/InterpreterUtils.cpp @@ -72,7 +72,7 @@ NamedDecl *LookupNamed(Sema &S, llvm::StringRef Name, const DeclContext *Within) { DeclarationName DName = &S.Context.Idents.get(Name); LookupResult R(S, DName, SourceLocation(), Sema::LookupOrdinaryName, - Sema::ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); R.suppressDiagnostics(); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 19abd5327b73aa..455ccb45b40687 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -5374,7 +5374,7 @@ static bool CheckAnonMemberRedeclaration(Sema &SemaRef, Scope *S, LookupResult R(SemaRef, Name, NameLoc, Owner->isRecord() ? Sema::LookupMemberName : Sema::LookupOrdinaryName, - Sema::ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); if (!SemaRef.LookupName(R, S)) return false; // Pick a representative declaration. 
@@ -6470,7 +6470,8 @@ NamedDecl *Sema::HandleDeclarator(Scope *S, Declarator &D, if (IsLinkageLookup) { Previous.clear(LookupRedeclarationWithLinkage); - Previous.setRedeclarationKind(ForExternalRedeclaration); + Previous.setRedeclarationKind( + RedeclarationKind::ForExternalRedeclaration); } LookupName(Previous, S, CreateBuiltins); @@ -8521,7 +8522,8 @@ void Sema::CheckShadow(Scope *S, VarDecl *D) { return; LookupResult R(*this, D->getDeclName(), D->getLocation(), - Sema::LookupOrdinaryName, Sema::ForVisibleRedeclaration); + Sema::LookupOrdinaryName, + RedeclarationKind::ForVisibleRedeclaration); LookupName(R, S); if (NamedDecl *ShadowedDecl = getShadowedDeclaration(D, R)) CheckShadow(D, ShadowedDecl, R); @@ -9161,7 +9163,7 @@ static NamedDecl *DiagnoseInvalidRedeclaration( LookupResult Prev(SemaRef, Name, NewFD->getLocation(), IsLocalFriend ? Sema::LookupLocalFriendName : Sema::LookupOrdinaryName, - Sema::ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); NewFD->setInvalidDecl(); if (IsLocalFriend) @@ -15196,7 +15198,7 @@ Decl *Sema::ActOnParamDeclarator(Scope *S, Declarator &D, const IdentifierInfo *II = D.getIdentifier(); if (II) { LookupResult R(*this, II, D.getIdentifierLoc(), LookupOrdinaryName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(R, S); if (!R.empty()) { NamedDecl *PrevDecl = *R.begin(); @@ -17428,7 +17430,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, RedeclarationKind Redecl = forRedeclarationInCurContext(); if (TUK == TUK_Friend || TUK == TUK_Reference) - Redecl = NotForRedeclaration; + Redecl = RedeclarationKind::NotForRedeclaration; /// Create a new tag decl in C/ObjC. Since the ODR-like semantics for ObjC/C /// implemented asks for structural equivalence checking, the returned decl @@ -18589,7 +18591,7 @@ FieldDecl *Sema::HandleField(Scope *S, RecordDecl *Record, // Check to see if this name was declared as a member previously NamedDecl *PrevDecl = nullptr; LookupResult Previous(*this, II, Loc, LookupMemberName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(Previous, S); switch (Previous.getResultKind()) { case LookupResult::Found: @@ -18993,8 +18995,9 @@ Decl *Sema::ActOnIvar(Scope *S, SourceLocation DeclStart, Declarator &D, NewID->setInvalidDecl(); if (II) { - NamedDecl *PrevDecl = LookupSingleName(S, II, Loc, LookupMemberName, - ForVisibleRedeclaration); + NamedDecl *PrevDecl = + LookupSingleName(S, II, Loc, LookupMemberName, + RedeclarationKind::ForVisibleRedeclaration); if (PrevDecl && isDeclInScope(PrevDecl, EnclosingContext, S) && !isa(PrevDecl)) { Diag(Loc, diag::err_duplicate_member) << II; @@ -20039,7 +20042,8 @@ Decl *Sema::ActOnEnumConstant(Scope *S, Decl *theEnumDecl, Decl *lastEnumConst, // Verify that there isn't already something declared with this name in this // scope. 
- LookupResult R(*this, Id, IdLoc, LookupOrdinaryName, ForVisibleRedeclaration); + LookupResult R(*this, Id, IdLoc, LookupOrdinaryName, + RedeclarationKind::ForVisibleRedeclaration); LookupName(R, S); NamedDecl *PrevDecl = R.getAsSingle(); diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 8c6bae545bfd15..591016243b0ac1 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -896,7 +896,7 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D, assert(VarName && "Cannot have an unnamed binding declaration"); LookupResult Previous(*this, NameInfo, LookupOrdinaryName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(Previous, S, /*CreateBuiltins*/DC->getRedeclContext()->isTranslationUnit()); @@ -951,7 +951,7 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D, DeclarationNameInfo NameInfo((IdentifierInfo *)nullptr, Decomp.getLSquareLoc()); LookupResult Previous(*this, NameInfo, LookupOrdinaryName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); // Build the variable that holds the non-decomposed object. bool AddToScope = true; @@ -11715,7 +11715,7 @@ Decl *Sema::ActOnStartNamespaceDef(Scope *NamespcScope, // look through using directives, just look for any ordinary names // as if by qualified name lookup. LookupResult R(*this, II, IdentLoc, LookupOrdinaryName, - ForExternalRedeclaration); + RedeclarationKind::ForExternalRedeclaration); LookupQualifiedName(R, CurContext->getRedeclContext()); NamedDecl *PrevDecl = R.isSingleResult() ? R.getRepresentativeDecl() : nullptr; @@ -12916,7 +12916,7 @@ NamedDecl *Sema::BuildUsingDeclaration( // Do the redeclaration lookup in the current scope. LookupResult Previous(*this, UsingName, LookupUsingDeclName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); Previous.setHideTags(false); if (S) { LookupName(Previous, S); @@ -13159,7 +13159,7 @@ NamedDecl *Sema::BuildUsingEnumDeclaration(Scope *S, AccessSpecifier AS, /// In class scope, check if this is a duplicate, for better a diagnostic. DeclarationNameInfo UsingEnumName(ED->getDeclName(), NameLoc); LookupResult Previous(*this, UsingEnumName, LookupUsingDeclName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(Previous, S); @@ -13192,7 +13192,7 @@ NamedDecl *Sema::BuildUsingEnumDeclaration(Scope *S, AccessSpecifier AS, UsingShadowDecl *PrevDecl = nullptr; DeclarationNameInfo DNI(EC->getDeclName(), EC->getLocation()); LookupResult Previous(*this, DNI, LookupOrdinaryName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(Previous, S); FilterUsingLookup(S, Previous); @@ -13587,7 +13587,7 @@ Decl *Sema::ActOnAliasDeclaration(Scope *S, AccessSpecifier AS, LookupResult Previous(*this, NameInfo, LookupOrdinaryName, TemplateParamLists.size() ? forRedeclarationInCurContext() - : ForVisibleRedeclaration); + : RedeclarationKind::ForVisibleRedeclaration); LookupName(Previous, S); // Warn about shadowing the name of a template parameter. @@ -13737,7 +13737,7 @@ Decl *Sema::ActOnNamespaceAliasDef(Scope *S, SourceLocation NamespaceLoc, // Check if we have a previous declaration with the same name. LookupResult PrevR(*this, Alias, AliasLoc, LookupOrdinaryName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(PrevR, S); // Check we're not shadowing a template parameter. 
@@ -13983,7 +13983,7 @@ void Sema::CheckImplicitSpecialMemberDeclaration(Scope *S, FunctionDecl *FD) { // implicit special members with this name. DeclarationName Name = FD->getDeclName(); LookupResult R(*this, Name, SourceLocation(), LookupOrdinaryName, - ForExternalRedeclaration); + RedeclarationKind::ForExternalRedeclaration); for (auto *D : FD->getParent()->lookup(Name)) if (auto *Acceptable = R.getAcceptableDecl(D)) R.addDecl(Acceptable); @@ -17113,9 +17113,9 @@ Decl *Sema::ActOnExceptionDeclarator(Scope *S, Declarator &D) { } const IdentifierInfo *II = D.getIdentifier(); - if (NamedDecl *PrevDecl = LookupSingleName(S, II, D.getIdentifierLoc(), - LookupOrdinaryName, - ForVisibleRedeclaration)) { + if (NamedDecl *PrevDecl = + LookupSingleName(S, II, D.getIdentifierLoc(), LookupOrdinaryName, + RedeclarationKind::ForVisibleRedeclaration)) { // The scope should be freshly made just for us. There is just no way // it contains any previous declaration, except for function parameters in // a function-try-block's catch statement. @@ -17906,7 +17906,7 @@ NamedDecl *Sema::ActOnFriendFunctionDecl(Scope *S, Declarator &D, DeclContext *DC; Scope *DCScope = S; LookupResult Previous(*this, NameInfo, LookupOrdinaryName, - ForExternalRedeclaration); + RedeclarationKind::ForExternalRedeclaration); bool isTemplateId = D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId; @@ -19242,7 +19242,7 @@ MSPropertyDecl *Sema::HandleMSProperty(Scope *S, RecordDecl *Record, // Check to see if this name was declared as a member previously NamedDecl *PrevDecl = nullptr; LookupResult Previous(*this, II, Loc, LookupMemberName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(Previous, S); switch (Previous.getResultKind()) { case LookupResult::Found: diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index f4a91ececfbb57..7582cbd75fec05 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -9153,7 +9153,7 @@ Sema::CheckMicrosoftIfExistsSymbol(Scope *S, // Do the redeclaration lookup in the current scope. LookupResult R(*this, TargetNameInfo, Sema::LookupAnyName, - Sema::NotForRedeclaration); + RedeclarationKind::NotForRedeclaration); LookupParsedName(R, S, &SS); R.suppressDiagnostics(); diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index 7ea6d733fe5a2d..c79128bc8f39e7 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -728,7 +728,7 @@ static bool LookupMemberExprInRecord(Sema &SemaRef, LookupResult &R, Sema &SemaRef; DeclarationNameInfo NameInfo; Sema::LookupNameKind LookupKind; - Sema::RedeclarationKind Redecl; + RedeclarationKind Redecl; }; QueryState Q = {R.getSema(), R.getLookupNameInfo(), R.getLookupKind(), R.redeclarationKind()}; diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index d65f52b8efe81f..55af414df39f51 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -4449,7 +4449,8 @@ LabelDecl *Sema::LookupOrCreateLabel(IdentifierInfo *II, SourceLocation Loc, } // Not a GNU local label. - Res = LookupSingleName(CurScope, II, Loc, LookupLabel, NotForRedeclaration); + Res = LookupSingleName(CurScope, II, Loc, LookupLabel, + RedeclarationKind::NotForRedeclaration); // If we found a label, check to see if it is in the same context as us. // When in a Block, we don't want to reuse a label in an enclosing function. 
if (Res && Res->getDeclContext() != CurContext) @@ -5889,7 +5890,8 @@ void Sema::clearDelayedTypo(TypoExpr *TE) { void Sema::ActOnPragmaDump(Scope *S, SourceLocation IILoc, IdentifierInfo *II) { DeclarationNameInfo Name(II, IILoc); - LookupResult R(*this, Name, LookupAnyName, Sema::NotForRedeclaration); + LookupResult R(*this, Name, LookupAnyName, + RedeclarationKind::NotForRedeclaration); R.suppressDiagnostics(); R.setHideTags(false); LookupName(R, S); @@ -5899,3 +5901,13 @@ void Sema::ActOnPragmaDump(Scope *S, SourceLocation IILoc, IdentifierInfo *II) { void Sema::ActOnPragmaDump(Expr *E) { E->dump(); } + +RedeclarationKind Sema::forRedeclarationInCurContext() const { + // A declaration with an owning module for linkage can never link against + // anything that is not visible. We don't need to check linkage here; if + // the context has internal linkage, redeclaration lookup won't find things + // from other TUs, and we can't safely compute linkage yet in general. + if (cast(CurContext)->getOwningModuleForLinkage(/*IgnoreLinkage*/ true)) + return RedeclarationKind::ForVisibleRedeclaration; + return RedeclarationKind::ForExternalRedeclaration; +} diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index d229ef650bccb0..3e9f6cba25076d 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -24944,7 +24944,7 @@ ExprResult SemaOpenMP::ActOnOMPIteratorExpr(Scope *S, // Check for conflicting previous declaration. DeclarationNameInfo NameInfo(VD->getDeclName(), D.DeclIdentLoc); LookupResult Previous(SemaRef, NameInfo, Sema::LookupOrdinaryName, - Sema::ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); Previous.suppressDiagnostics(); SemaRef.LookupName(Previous, S); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 95171359f0ab17..f4b6e1ceb6f023 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -972,8 +972,9 @@ void Sema::translateTemplateArguments(const ASTTemplateArgsPtr &TemplateArgsIn, static void maybeDiagnoseTemplateParameterShadow(Sema &SemaRef, Scope *S, SourceLocation Loc, const IdentifierInfo *Name) { - NamedDecl *PrevDecl = SemaRef.LookupSingleName( - S, Name, Loc, Sema::LookupOrdinaryName, Sema::ForVisibleRedeclaration); + NamedDecl *PrevDecl = + SemaRef.LookupSingleName(S, Name, Loc, Sema::LookupOrdinaryName, + RedeclarationKind::ForVisibleRedeclaration); if (PrevDecl && PrevDecl->isTemplateParameter()) SemaRef.DiagnoseTemplateParameterShadow(Loc, PrevDecl); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 6d359c5a9a024c..caa07abb61fe34 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -2296,7 +2296,7 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( SemaRef, Function->getDeclName(), SourceLocation(), D->isLocalExternDecl() ? Sema::LookupRedeclarationWithLinkage : Sema::LookupOrdinaryName, - D->isLocalExternDecl() ? Sema::ForExternalRedeclaration + D->isLocalExternDecl() ? 
RedeclarationKind::ForExternalRedeclaration : SemaRef.forRedeclarationInCurContext()); if (DependentFunctionTemplateSpecializationInfo *DFTSI = @@ -2697,7 +2697,7 @@ Decl *TemplateDeclInstantiator::VisitCXXMethodDecl( Method->setInvalidDecl(); LookupResult Previous(SemaRef, NameInfo, Sema::LookupOrdinaryName, - Sema::ForExternalRedeclaration); + RedeclarationKind::ForExternalRedeclaration); bool IsExplicitSpecialization = false; @@ -3365,7 +3365,7 @@ Decl *TemplateDeclInstantiator::VisitUsingDecl(UsingDecl *D) { // fact, it's not really even possible in non-class scopes). bool CheckRedeclaration = Owner->isRecord(); LookupResult Prev(SemaRef, NameInfo, Sema::LookupUsingDeclName, - Sema::ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); UsingDecl *NewUD = UsingDecl::Create(SemaRef.Context, Owner, D->getUsingLoc(), @@ -5388,7 +5388,7 @@ void Sema::BuildVariableInstantiation( *this, NewVar->getDeclName(), NewVar->getLocation(), NewVar->isLocalExternDecl() ? Sema::LookupRedeclarationWithLinkage : Sema::LookupOrdinaryName, - NewVar->isLocalExternDecl() ? Sema::ForExternalRedeclaration + NewVar->isLocalExternDecl() ? RedeclarationKind::ForExternalRedeclaration : forRedeclarationInCurContext()); if (NewVar->isLocalExternDecl() && OldVar->getPreviousDecl() && From 950bb097e11d6ee26533c00519c62df994322228 Mon Sep 17 00:00:00 2001 From: Dinar Temirbulatov Date: Wed, 17 Apr 2024 15:30:40 +0000 Subject: [PATCH 268/300] Revert "[Clang][AArch64] Warn when calling non/streaming about vector size difference (#79842)" This reverts commit 4e85e1ffcaf161736e27a24c291c1177be865976 --- clang/include/clang/Basic/DiagnosticGroups.td | 3 - .../clang/Basic/DiagnosticSemaKinds.td | 10 -- clang/lib/Sema/SemaChecking.cpp | 22 +-- clang/lib/Sema/SemaDecl.cpp | 12 +- .../Sema/aarch64-incompat-sm-builtin-calls.c | 6 +- clang/test/Sema/aarch64-sme-func-attrs.c | 136 +----------------- 6 files changed, 5 insertions(+), 184 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 47747d8704b6c8..5251774ff4efd6 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1412,9 +1412,6 @@ def MultiGPU: DiagGroup<"multi-gpu">; // libc and the CRT to be skipped. def AVRRtlibLinkingQuirks : DiagGroup<"avr-rtlib-linking-quirks">; -// A warning group related to AArch64 SME function attribues. -def AArch64SMEAttributes : DiagGroup<"aarch64-sme-attributes">; - // A warning group for things that will change semantics in the future. def FutureCompat : DiagGroup<"future-compat">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 44f802c0c28e84..30a8543489f48e 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3751,16 +3751,6 @@ def err_sme_definition_using_za_in_non_sme_target : Error< "function using ZA state requires 'sme'">; def err_sme_definition_using_zt0_in_non_sme2_target : Error< "function using ZT0 state requires 'sme2'">; -def warn_sme_streaming_pass_return_vl_to_non_streaming : Warning< - "passing a VL-dependent argument to/from a function that has a different" - " streaming-mode. The streaming and non-streaming vector lengths may be" - " different">, - InGroup, DefaultIgnore; -def warn_sme_locally_streaming_has_vl_args_returns : Warning< - "passing/returning a VL-dependent argument to/from a __arm_locally_streaming" - " function. 
The streaming and non-streaming vector" - " lengths may be different">, - InGroup, DefaultIgnore; def err_conflicting_attributes_arm_state : Error< "conflicting attributes for state '%0'">; def err_sme_streaming_cannot_be_multiversioned : Error< diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 99b0a00083535e..d814327d23470f 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -7949,7 +7949,6 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, // For variadic functions, we may have more args than parameters. // For some K&R functions, we may have less args than parameters. const auto N = std::min(Proto->getNumParams(), Args.size()); - bool AnyScalableArgsOrRet = Proto->getReturnType()->isSizelessVectorType(); for (unsigned ArgIdx = 0; ArgIdx < N; ++ArgIdx) { // Args[ArgIdx] can be null in malformed code. if (const Expr *Arg = Args[ArgIdx]) { @@ -7963,8 +7962,6 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, checkAIXMemberAlignment((Arg->getExprLoc()), Arg); QualType ParamTy = Proto->getParamType(ArgIdx); - if (ParamTy->isSizelessVectorType()) - AnyScalableArgsOrRet = true; QualType ArgTy = Arg->getType(); CheckArgAlignment(Arg->getExprLoc(), FDecl, std::to_string(ArgIdx + 1), ArgTy, ParamTy); @@ -7985,23 +7982,6 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, } } - // If the call requires a streaming-mode change and has scalable vector - // arguments or return values, then warn the user that the streaming and - // non-streaming vector lengths may be different. - const auto *CallerFD = dyn_cast(CurContext); - if (CallerFD && (!FD || !FD->getBuiltinID()) && AnyScalableArgsOrRet) { - bool IsCalleeStreaming = - ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask; - bool IsCalleeStreamingCompatible = - ExtInfo.AArch64SMEAttributes & - FunctionType::SME_PStateSMCompatibleMask; - ArmStreamingType CallerFnType = getArmStreamingFnType(CallerFD); - if (!IsCalleeStreamingCompatible && - (CallerFnType == ArmStreamingCompatible || - ((CallerFnType == ArmStreaming) ^ IsCalleeStreaming))) - Diag(Loc, diag::warn_sme_streaming_pass_return_vl_to_non_streaming); - } - FunctionType::ArmStateValue CalleeArmZAState = FunctionType::getArmZAState(ExtInfo.AArch64SMEAttributes); FunctionType::ArmStateValue CalleeArmZT0State = @@ -8010,7 +7990,7 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, CalleeArmZT0State != FunctionType::ARM_None) { bool CallerHasZAState = false; bool CallerHasZT0State = false; - if (CallerFD) { + if (const auto *CallerFD = dyn_cast(CurContext)) { auto *Attr = CallerFD->getAttr(); if (Attr && Attr->isNewZA()) CallerHasZAState = true; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 455ccb45b40687..1bde99d6fce740 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -12408,22 +12408,12 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, } // Check if the function definition uses any AArch64 SME features without - // having the '+sme' feature enabled and warn user if sme locally streaming - // function returns or uses arguments with VL-based types. + // having the '+sme' feature enabled. 
if (DeclIsDefn) { const auto *Attr = NewFD->getAttr(); bool UsesSM = NewFD->hasAttr(); bool UsesZA = Attr && Attr->isNewZA(); bool UsesZT0 = Attr && Attr->isNewZT0(); - - if (NewFD->hasAttr()) { - if (NewFD->getReturnType()->isSizelessVectorType() || - llvm::any_of(NewFD->parameters(), [](ParmVarDecl *P) { - return P->getOriginalType()->isSizelessVectorType(); - })) - Diag(NewFD->getLocation(), - diag::warn_sme_locally_streaming_has_vl_args_returns); - } if (const auto *FPT = NewFD->getType()->getAs()) { FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo(); UsesSM |= diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c index 6a1feeb9bf5397..55c97c73e8b695 100644 --- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ -// RUN: -target-feature +sme2 -target-feature +sve2 -target-feature +neon -Waarch64-sme-attributes -fsyntax-only -verify %s +// RUN: -target-feature +sme2 -target-feature +sve2 -target-feature +neon -fsyntax-only -verify %s // REQUIRES: aarch64-registered-target @@ -33,7 +33,6 @@ svuint32_t incompat_sve_sm(svbool_t pg, svuint32_t a, int16_t b) __arm_streaming return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); } -// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svuint32_t incompat_sve_ls(svbool_t pg, svuint32_t a, int64_t b) { // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); @@ -49,7 +48,6 @@ svuint32_t incompat_sve2_sm(svbool_t pg, svuint32_t a, int64_t b) __arm_streamin return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); } -// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svuint32_t incompat_sve2_ls(svbool_t pg, svuint32_t a, int64_t b) { // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); @@ -70,7 +68,6 @@ svfloat64_t streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) __arm_ return svadd_n_f64_m(pg, a, b); } -// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svfloat64_t locally_streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) { // expected-no-warning return svadd_n_f64_m(pg, a, b); @@ -86,7 +83,6 @@ svint16_t streaming_caller_sve2(svint16_t op1, svint16_t op2) __arm_streaming { return svmul_lane_s16(op1, op2, 0); } -// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. 
The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svint16_t locally_streaming_caller_sve2(svint16_t op1, svint16_t op2) { // expected-no-warning return svmul_lane_s16(op1, op2, 0); diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c index 12de16509ccb8d..bfc8768c3f36e1 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs.c +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -Waarch64-sme-attributes -fsyntax-only -verify %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -Waarch64-sme-attributes -fsyntax-only -verify=expected-cpp -x c++ %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify=expected-cpp -x c++ %s // Valid attributes @@ -496,135 +496,3 @@ void fmv_caller() { just_fine(); incompatible_locally_streaming(); } - -void sme_streaming_with_vl_arg(__SVInt8_t a) __arm_streaming { } - -__SVInt8_t sme_streaming_returns_vl(void) __arm_streaming { __SVInt8_t r; return r; } - -void sme_streaming_compatible_with_vl_arg(__SVInt8_t a) __arm_streaming_compatible { } - -__SVInt8_t sme_streaming_compatible_returns_vl(void) __arm_streaming_compatible { __SVInt8_t r; return r; } - -void sme_no_streaming_with_vl_arg(__SVInt8_t a) { } - -__SVInt8_t sme_no_streaming_returns_vl(void) { __SVInt8_t r; return r; } - -// expected-warning@+2 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} -// expected-cpp-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} -__arm_locally_streaming void sme_locally_streaming_with_vl_arg(__SVInt8_t a) { } - -// expected-warning@+2 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} -// expected-cpp-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} -__arm_locally_streaming __SVInt8_t sme_locally_streaming_returns_vl(void) { __SVInt8_t r; return r; } - -void sme_no_streaming_calling_streaming_with_vl_args() { - __SVInt8_t a; - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - sme_streaming_with_vl_arg(a); -} - -void sme_no_streaming_calling_streaming_with_return_vl() { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. 
The streaming and non-streaming vector lengths may be different}} - __SVInt8_t r = sme_streaming_returns_vl(); -} - -void sme_streaming_calling_non_streaming_with_vl_args(void) __arm_streaming { - __SVInt8_t a; - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - sme_no_streaming_with_vl_arg(a); -} - -void sme_streaming_calling_non_streaming_with_return_vl(void) __arm_streaming { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - __SVInt8_t r = sme_no_streaming_returns_vl(); -} - -void sme_no_streaming_calling_streaming_with_vl_args_param(__SVInt8_t arg, void (*sc)( __SVInt8_t arg) __arm_streaming) { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - sc(arg); -} - -__SVInt8_t sme_no_streaming_calling_streaming_return_vl_param(__SVInt8_t (*s)(void) __arm_streaming) { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - return s(); -} - -void sme_streaming_compatible_calling_streaming_with_vl_args(__SVInt8_t arg) __arm_streaming_compatible { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - sme_streaming_with_vl_arg(arg); -} - -void sme_streaming_compatible_calling_sme_streaming_return_vl(void) __arm_streaming_compatible { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - __SVInt8_t r = sme_streaming_returns_vl(); -} - -void sme_streaming_compatible_calling_no_streaming_with_vl_args(__SVInt8_t arg) __arm_streaming_compatible { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. 
The streaming and non-streaming vector lengths may be different}} - sme_no_streaming_with_vl_arg(arg); -} - -void sme_streaming_compatible_calling_no_sme_streaming_return_vl(void) __arm_streaming_compatible { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - __SVInt8_t r = sme_no_streaming_returns_vl(); -} - -void sme_streaming_calling_streaming(__SVInt8_t arg, void (*s)( __SVInt8_t arg) __arm_streaming) __arm_streaming { - s(arg); -} - -__SVInt8_t sme_streaming_calling_streaming_return_vl(__SVInt8_t (*s)(void) __arm_streaming) __arm_streaming { - return s(); -} - -void sme_streaming_calling_streaming_with_vl_args(__SVInt8_t a) __arm_streaming { - sme_streaming_with_vl_arg(a); -} - -void sme_streaming_calling_streaming_with_return_vl(void) __arm_streaming { - __SVInt8_t r = sme_streaming_returns_vl(); -} - -void sme_streaming_calling_streaming_compatible_with_vl_args(__SVInt8_t a) __arm_streaming { - sme_streaming_compatible_with_vl_arg(a); -} - -void sme_streaming_calling_streaming_compatible_with_return_vl(void) __arm_streaming { - __SVInt8_t r = sme_streaming_compatible_returns_vl(); -} - -void sme_no_streaming_calling_streaming_compatible_with_vl_args() { - __SVInt8_t a; - sme_streaming_compatible_with_vl_arg(a); -} - -void sme_no_streaming_calling_streaming_compatible_with_return_vl() { - __SVInt8_t r = sme_streaming_compatible_returns_vl(); -} - -void sme_no_streaming_calling_non_streaming_compatible_with_vl_args() { - __SVInt8_t a; - sme_no_streaming_with_vl_arg(a); -} - -void sme_no_streaming_calling_non_streaming_compatible_with_return_vl() { - __SVInt8_t r = sme_no_streaming_returns_vl(); -} - -void sme_streaming_compatible_calling_streaming_compatible_with_vl_args(__SVInt8_t arg) __arm_streaming_compatible { - sme_streaming_compatible_with_vl_arg(arg); -} - -void sme_streaming_compatible_calling_streaming_compatible_with_return_vl(void) __arm_streaming_compatible { - __SVInt8_t r = sme_streaming_compatible_returns_vl(); -} From b854a2323337be2633b1135f590678a17e9d1ade Mon Sep 17 00:00:00 2001 From: Robin Caloudis Date: Wed, 17 Apr 2024 17:38:47 +0200 Subject: [PATCH 269/300] [libc][c23][fenv] Implement fetestexceptflag (#87828) Provide C23 `fetestexceptflag` function according to 7.6.4.6 in the latest [revision of the C standard](https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3096.pdf) from 2023-04-02. Closes https://github.com/llvm/llvm-project/issues/87565. 
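For readers unfamiliar with the flag-state functions, here is a minimal usage sketch of the C23 interface this patch wires up. It assumes a libc that actually declares `fetestexceptflag` (for example, a build against these llvm-libc entrypoints); pre-C23 toolchains generally will not:

```cpp
#include <fenv.h>
#include <stdio.h>

int main() {
  feraiseexcept(FE_OVERFLOW | FE_INEXACT);

  fexcept_t saved;
  fegetexceptflag(&saved, FE_ALL_EXCEPT); // snapshot the raised flags
  feclearexcept(FE_ALL_EXCEPT);           // the live flags are now clear

  // Per 7.6.4.6, this queries the snapshot rather than the live
  // environment, so FE_OVERFLOW is still reported after the clear.
  if (fetestexceptflag(&saved, FE_OVERFLOW))
    puts("overflow recorded in the saved flag state");
  return 0;
}
```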
--- libc/config/baremetal/arm/entrypoints.txt | 1 + libc/config/baremetal/riscv/entrypoints.txt | 1 + libc/config/darwin/arm/entrypoints.txt | 1 + libc/config/darwin/x86_64/entrypoints.txt | 1 + libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/arm/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/config/windows/entrypoints.txt | 1 + libc/docs/c23.rst | 2 +- libc/docs/fenv.rst | 4 +-- libc/spec/stdc.td | 5 +++ libc/src/fenv/CMakeLists.txt | 13 ++++++++ libc/src/fenv/fetestexceptflag.cpp | 23 +++++++++++++ libc/src/fenv/fetestexceptflag.h | 20 ++++++++++++ libc/test/src/fenv/CMakeLists.txt | 1 + libc/test/src/fenv/exception_flags_test.cpp | 32 +++++++++++++++---- .../llvm-project-overlay/libc/BUILD.bazel | 10 ++++++ .../libc/test/src/fenv/BUILD.bazel | 1 + 19 files changed, 110 insertions(+), 10 deletions(-) create mode 100644 libc/src/fenv/fetestexceptflag.cpp create mode 100644 libc/src/fenv/fetestexceptflag.h diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index f33f9430c79205..4e3d1cb9f5337a 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -201,6 +201,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index dad187fa0496d3..7efd9bcd5b3cb8 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -201,6 +201,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt index aea2f6d5771e87..e1303265b9ac41 100644 --- a/libc/config/darwin/arm/entrypoints.txt +++ b/libc/config/darwin/arm/entrypoints.txt @@ -112,6 +112,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/darwin/x86_64/entrypoints.txt b/libc/config/darwin/x86_64/entrypoints.txt index 09fe3d7b476870..02912decadcf79 100644 --- a/libc/config/darwin/x86_64/entrypoints.txt +++ b/libc/config/darwin/x86_64/entrypoints.txt @@ -106,6 +106,7 @@ set(TARGET_LIBM_ENTRYPOINTS # libc.src.fenv.fesetround # libc.src.fenv.feraiseexcept # libc.src.fenv.fetestexcept + # libc.src.fenv.fetestexceptflag # libc.src.fenv.feupdateenv ## Currently disabled for failing tests. 
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 2952baacdd67fd..1ac6bd93000082 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -324,6 +324,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index 35fd588a9a6c4d..335981ff7dc7cf 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -192,6 +192,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 47c03a61c45a93..87e82e5eb9a067 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -332,6 +332,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 8fdd4575e27e28..70f130a4399a36 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -346,6 +346,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt index c46c947bf31354..71216530c4041b 100644 --- a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -110,6 +110,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/docs/c23.rst b/libc/docs/c23.rst index 4138c9d7104f33..44724fe1660cbe 100644 --- a/libc/docs/c23.rst +++ b/libc/docs/c23.rst @@ -21,7 +21,7 @@ Additions: * fenv.h * fesetexcept |check| - * fetestexceptflag + * fetestexceptflag |check| * fegetmode * fesetmode * math.h diff --git a/libc/docs/fenv.rst b/libc/docs/fenv.rst index 6574fb7246ddd2..1dee5515e1174b 100644 --- a/libc/docs/fenv.rst +++ b/libc/docs/fenv.rst @@ -42,7 +42,7 @@ fenv.h Functions - |check| - 7.6.6.3 * - fesetexcept - - + - |check| - 7.6.4.4 * - fesetexceptflag - |check| @@ -57,7 +57,7 @@ fenv.h Functions - |check| - 7.6.4.7 * - fetestexceptflag - - + - |check| - 7.6.4.6 * - feupdateenv - |check| diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 63d0449867114d..01aa7c70b3b9df 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -149,6 +149,11 @@ def StdC : StandardSpec<"stdc"> { RetValSpec, [ArgSpec] >, + FunctionSpec< + "fetestexceptflag", + RetValSpec, + [ArgSpec, ArgSpec] + >, FunctionSpec< "feraiseexcept", RetValSpec, diff --git a/libc/src/fenv/CMakeLists.txt b/libc/src/fenv/CMakeLists.txt index 17e99474120627..c5431b1b9d55e0 100644 --- a/libc/src/fenv/CMakeLists.txt +++ b/libc/src/fenv/CMakeLists.txt @@ -58,6 +58,19 @@ add_entrypoint_object( -O2 ) +add_entrypoint_object( + fetestexceptflag + SRCS + fetestexceptflag.cpp + HDRS + fetestexceptflag.h + 
DEPENDS + libc.hdr.types.fexcept_t + libc.src.__support.FPUtil.fenv_impl + COMPILE_OPTIONS + -O2 +) + add_entrypoint_object( fegetenv SRCS diff --git a/libc/src/fenv/fetestexceptflag.cpp b/libc/src/fenv/fetestexceptflag.cpp new file mode 100644 index 00000000000000..63453350a199f5 --- /dev/null +++ b/libc/src/fenv/fetestexceptflag.cpp @@ -0,0 +1,23 @@ +//===-- Implementation of fetestexceptflag function -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fenv/fetestexceptflag.h" +#include "hdr/types/fexcept_t.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, fetestexceptflag, + (const fexcept_t *flagp, int excepts)) { + static_assert(sizeof(int) >= sizeof(fexcept_t), + "fexcept_t value cannot fit in an int value."); + return *flagp | fputil::test_except(excepts); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/fenv/fetestexceptflag.h b/libc/src/fenv/fetestexceptflag.h new file mode 100644 index 00000000000000..1c8b0b843f5477 --- /dev/null +++ b/libc/src/fenv/fetestexceptflag.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fetestexceptflag --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_FENV_FETESTEXCEPTFLAG_H +#define LLVM_LIBC_SRC_FENV_FETESTEXCEPTFLAG_H + +#include "hdr/types/fexcept_t.h" + +namespace LIBC_NAMESPACE { + +int fetestexceptflag(const fexcept_t *, int excepts); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_FENV_FETESTEXCEPTFLAG_H diff --git a/libc/test/src/fenv/CMakeLists.txt b/libc/test/src/fenv/CMakeLists.txt index 577735599dc010..f277b65e2d42be 100644 --- a/libc/test/src/fenv/CMakeLists.txt +++ b/libc/test/src/fenv/CMakeLists.txt @@ -48,6 +48,7 @@ add_libc_unittest( DEPENDS libc.src.fenv.fegetexceptflag libc.src.fenv.fesetexceptflag + libc.src.fenv.fetestexceptflag libc.src.__support.FPUtil.fenv_impl ) diff --git a/libc/test/src/fenv/exception_flags_test.cpp b/libc/test/src/fenv/exception_flags_test.cpp index d1d8bfcc53db56..9d2be6426a6d0b 100644 --- a/libc/test/src/fenv/exception_flags_test.cpp +++ b/libc/test/src/fenv/exception_flags_test.cpp @@ -1,4 +1,4 @@ -//===-- Unittests for fegetexceptflag and fesetexceptflag -----------------===// +//===-- Unittests for fe{get|set|test}exceptflag --------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,11 +9,12 @@ #include "hdr/types/fexcept_t.h" #include "src/fenv/fegetexceptflag.h" #include "src/fenv/fesetexceptflag.h" +#include "src/fenv/fetestexceptflag.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "test/UnitTest/Test.h" -TEST(LlvmLibcFenvTest, GetExceptFlagAndSetExceptFlag) { +TEST(LlvmLibcFenvTest, GetSetTestExceptFlag) { // We will disable all exceptions to prevent invocation of the exception // handler. 
LIBC_NAMESPACE::fputil::disable_except(FE_ALL_EXCEPT); @@ -39,19 +40,36 @@ TEST(LlvmLibcFenvTest, GetExceptFlagAndSetExceptFlag) { ASSERT_EQ(LIBC_NAMESPACE::fesetexceptflag(&eflags, FE_ALL_EXCEPT), 0); ASSERT_NE(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT) & e, 0); + // Exception flags are exactly the flags corresponding to the previously + // raised exception. + ASSERT_EQ(LIBC_NAMESPACE::fetestexceptflag(&eflags, FE_ALL_EXCEPT), + LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT)); + // Cleanup. We clear all excepts as raising excepts like FE_OVERFLOW // can also raise FE_INEXACT. LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); } - // Next, we will raise one exception and save the flags. + // Next, we will raise one exception, save the flag and clear all exceptions. LIBC_NAMESPACE::fputil::raise_except(FE_INVALID); - fexcept_t eflags; - LIBC_NAMESPACE::fegetexceptflag(&eflags, FE_ALL_EXCEPT); - // Clear all exceptions and raise two other exceptions. + fexcept_t invalid_flag; + LIBC_NAMESPACE::fegetexceptflag(&invalid_flag, FE_ALL_EXCEPT); + ASSERT_EQ(LIBC_NAMESPACE::fetestexceptflag(&invalid_flag, FE_ALL_EXCEPT), + FE_INVALID); LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + + // Raise two other exceptions and verify that they are set. LIBC_NAMESPACE::fputil::raise_except(FE_OVERFLOW | FE_INEXACT); + fexcept_t overflow_and_inexact_flag; + LIBC_NAMESPACE::fegetexceptflag(&overflow_and_inexact_flag, FE_ALL_EXCEPT); + ASSERT_EQ(LIBC_NAMESPACE::fetestexceptflag(&overflow_and_inexact_flag, + FE_ALL_EXCEPT), + FE_OVERFLOW | FE_INEXACT); + ASSERT_EQ(LIBC_NAMESPACE::fetestexceptflag(&overflow_and_inexact_flag, + FE_OVERFLOW | FE_INEXACT), + FE_OVERFLOW | FE_INEXACT); + // When we set the flags and test, we should only see FE_INVALID. - LIBC_NAMESPACE::fesetexceptflag(&eflags, FE_ALL_EXCEPT); + LIBC_NAMESPACE::fesetexceptflag(&invalid_flag, FE_ALL_EXCEPT); EXPECT_EQ(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT), FE_INVALID); } diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index fb37f113b310a7..6029cc3fee6108 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1145,6 +1145,16 @@ libc_function( ], ) +libc_function( + name = "fetestexceptflag", + srcs = ["src/fenv/fetestexceptflag.cpp"], + hdrs = ["src/fenv/fetestexceptflag.h"], + deps = [ + ":__support_common", + ":__support_fputil_fenv_impl", + ], +) + libc_function( name = "feclearexcept", srcs = ["src/fenv/feclearexcept.cpp"], diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel index 359db0723dfd3c..fc3ab3da3587c5 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel @@ -75,6 +75,7 @@ libc_test( libc_function_deps = [ "//libc:fegetexceptflag", "//libc:fesetexceptflag", + "//libc:fetestexceptflag", ], deps = [ "//libc:__support_fputil_fenv_impl", From 8656d4c6a7a742c6fa6ee02c2ace7415163e65e4 Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Wed, 17 Apr 2024 11:41:03 -0400 Subject: [PATCH 270/300] [Clang][Parse] Diagnose requires expressions with explicit object parameters (#88974) Clang currently allows the following: ``` auto x = requires (this int) { true; }; ``` This patch addresses that. 
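With this change the snippet above is rejected at parse time; for illustration,
the diagnostic (using the wording added to DiagnosticParseKinds.td below) reads
roughly:
```
error: a requires expression cannot have an explicit object parameter
```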
--- clang/docs/ReleaseNotes.rst | 2 ++ clang/include/clang/Basic/DiagnosticParseKinds.td | 2 ++ clang/lib/Parse/ParseDecl.cpp | 15 ++++++++++++++- .../CXX/dcl.decl/dcl.meaning/dcl.fct/p6-cxx23.cpp | 7 +++++++ 4 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p6-cxx23.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 96ad92b540b47f..c19ad9fba58f37 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -364,6 +364,8 @@ Improvements to Clang's diagnostics - Clang now uses the correct type-parameter-key (``class`` or ``typename``) when printing template template parameter declarations. +- Clang now diagnoses requires expressions with explicit object parameters. + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index bb9ca2a50cc06c..66405095d51de8 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -863,6 +863,8 @@ def err_empty_requires_expr : Error< "a requires expression must contain at least one requirement">; def err_requires_expr_parameter_list_ellipsis : Error< "varargs not allowed in requires expression">; +def err_requires_expr_explicit_object_parameter: Error< + "a requires expression cannot have an explicit object parameter">; def err_expected_semi_requirement : Error< "expected ';' at end of requirement">; def err_requires_expr_missing_arrow : Error< diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 274ee7b10c1787..5f26b5a9e46bef 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -7660,8 +7660,21 @@ void Parser::ParseParameterDeclarationClause( // Parse a C++23 Explicit Object Parameter // We do that in all language modes to produce a better diagnostic. SourceLocation ThisLoc; - if (getLangOpts().CPlusPlus && Tok.is(tok::kw_this)) + if (getLangOpts().CPlusPlus && Tok.is(tok::kw_this)) { ThisLoc = ConsumeToken(); + // C++23 [dcl.fct]p6: + // An explicit-object-parameter-declaration is a parameter-declaration + // with a this specifier. An explicit-object-parameter-declaration + // shall appear only as the first parameter-declaration of a + // parameter-declaration-list of either: + // - a member-declarator that declares a member function, or + // - a lambda-declarator. + // + // The parameter-declaration-list of a requires-expression is not such + // a context. 
+ if (DeclaratorCtx == DeclaratorContext::RequiresExpr) + Diag(ThisLoc, diag::err_requires_expr_explicit_object_parameter); + } ParseDeclarationSpecifiers(DS, /*TemplateInfo=*/ParsedTemplateInfo(), AS_none, DeclSpecContext::DSC_normal, diff --git a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p6-cxx23.cpp b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p6-cxx23.cpp new file mode 100644 index 00000000000000..9c1f30f81a0115 --- /dev/null +++ b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p6-cxx23.cpp @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s + +auto x0 = requires (this int) { true; }; // expected-error {{a requires expression cannot have an explicit object parameter}} +auto x1 = requires (int, this int) { true; }; // expected-error {{a requires expression cannot have an explicit object parameter}} + +template // expected-error {{expected template parameter}} +void f(); // expected-error {{no function template matches function template specialization 'f'}} From abd5e45a96954d80f6ffe6d8676c0059fae8573b Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Wed, 17 Apr 2024 08:42:41 -0700 Subject: [PATCH 271/300] [compiler-rt] Use __atomic builtins whenever possible The code in this file dates back to 2012 when Clang's support for atomic builtins was still quite limited. The bugs referenced in the comment at the top of the file have long been fixed and using the compiler builtins directly should now generate slightly better code. Additionally, this allows using the atomic builtin header for platforms where the __sync_builtins are lacking (e.g. Arm Morello). This change does not introduce any code generation changes for __tsan_read*/__tsan_write* or __tsan_func_{entry,exit} on x86, which indicates the previously noted compiler issues have been fixed. We also have to touch the non-clang codepaths here since the only way we can make this work easily is by making the memory_order enum match the compiler-provided macros, so we have to update the debug checks that assumed the enum was always a bitflag. The one downside of this change is that 32-bit MIPS now definitely requires libatomic (but that may already have been needed for RMW ops). 
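As a sketch of the mechanical change (mirroring the atomic_exchange rewrite in
sanitizer_atomic_clang.h below), each fence-wrapped __sync builtin becomes a
single __atomic builtin that takes the caller's memory order, which now maps
one-to-one onto the compiler's __ATOMIC_* macro values:
```
// Before: ordering emulated with explicit fences around a __sync builtin.
if (mo & (memory_order_release | memory_order_acq_rel | memory_order_seq_cst))
  __sync_synchronize();
v = __sync_lock_test_and_set(&a->val_dont_use, v);
if (mo == memory_order_seq_cst)
  __sync_synchronize();

// After: the builtin derives the required barriers from mo itself.
return __atomic_exchange_n(&a->val_dont_use, v, mo);
```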
Reviewed By: dvyukov Pull Request: https://github.com/llvm/llvm-project/pull/84439 --- .../lib/sanitizer_common/CMakeLists.txt | 3 - .../lib/sanitizer_common/sanitizer_atomic.h | 12 ++ .../sanitizer_common/sanitizer_atomic_clang.h | 85 ++++++------- .../sanitizer_atomic_clang_mips.h | 117 ------------------ .../sanitizer_atomic_clang_other.h | 85 ------------- .../sanitizer_atomic_clang_x86.h | 113 ----------------- .../sanitizer_common/sanitizer_atomic_msvc.h | 8 +- .../compiler-rt/lib/sanitizer_common/BUILD.gn | 3 - 8 files changed, 56 insertions(+), 370 deletions(-) delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_mips.h delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_other.h delete mode 100644 compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_x86.h diff --git a/compiler-rt/lib/sanitizer_common/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/CMakeLists.txt index f2b4ac72ae1573..66f2d259aa5fd4 100644 --- a/compiler-rt/lib/sanitizer_common/CMakeLists.txt +++ b/compiler-rt/lib/sanitizer_common/CMakeLists.txt @@ -122,9 +122,6 @@ set(SANITIZER_IMPL_HEADERS sanitizer_asm.h sanitizer_atomic.h sanitizer_atomic_clang.h - sanitizer_atomic_clang_mips.h - sanitizer_atomic_clang_other.h - sanitizer_atomic_clang_x86.h sanitizer_atomic_msvc.h sanitizer_bitvector.h sanitizer_bvgraph.h diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h index 46f06957228c9b..0609a11ffdebb0 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h @@ -18,12 +18,24 @@ namespace __sanitizer { enum memory_order { +// If the __atomic atomic builtins are supported (Clang/GCC), use the +// compiler provided macro values so that we can map the atomic operations +// to __atomic_* directly. 
+#ifdef __ATOMIC_SEQ_CST + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_consume = __ATOMIC_CONSUME, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +#else memory_order_relaxed = 1 << 0, memory_order_consume = 1 << 1, memory_order_acquire = 1 << 2, memory_order_release = 1 << 3, memory_order_acq_rel = 1 << 4, memory_order_seq_cst = 1 << 5 +#endif }; struct atomic_uint8_t { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h index 4318d64d16cfa2..1414092e38d7e2 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h @@ -14,60 +14,63 @@ #ifndef SANITIZER_ATOMIC_CLANG_H #define SANITIZER_ATOMIC_CLANG_H -#if defined(__i386__) || defined(__x86_64__) -# include "sanitizer_atomic_clang_x86.h" -#else -# include "sanitizer_atomic_clang_other.h" -#endif - namespace __sanitizer { -// We would like to just use compiler builtin atomic operations -// for loads and stores, but they are mostly broken in clang: -// - they lead to vastly inefficient code generation -// (http://llvm.org/bugs/show_bug.cgi?id=17281) -// - 64-bit atomic operations are not implemented on x86_32 -// (http://llvm.org/bugs/show_bug.cgi?id=15034) -// - they are not implemented on ARM -// error: undefined reference to '__atomic_load_4' +// We use the compiler builtin atomic operations for loads and stores, which +// generates correct code for all architectures, but may require libatomic +// on platforms where e.g. 64-bit atomics are not supported natively. // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html // for mappings of the memory model to different processors. 
-inline void atomic_signal_fence(memory_order) { +inline void atomic_signal_fence(memory_order mo) { __atomic_signal_fence(mo); } + +inline void atomic_thread_fence(memory_order mo) { __atomic_thread_fence(mo); } + +inline void proc_yield(int cnt) { + __asm__ __volatile__("" ::: "memory"); +#if defined(__i386__) || defined(__x86_64__) + for (int i = 0; i < cnt; i++) __asm__ __volatile__("pause"); __asm__ __volatile__("" ::: "memory"); +#endif } -inline void atomic_thread_fence(memory_order) { - __sync_synchronize(); +template +inline typename T::Type atomic_load(const volatile T *a, memory_order mo) { + DCHECK(mo == memory_order_relaxed || mo == memory_order_consume || + mo == memory_order_acquire || mo == memory_order_seq_cst); + DCHECK(!((uptr)a % sizeof(*a))); + return __atomic_load_n(&a->val_dont_use, mo); } -template -inline typename T::Type atomic_fetch_add(volatile T *a, - typename T::Type v, memory_order mo) { - (void)mo; +template +inline void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { + DCHECK(mo == memory_order_relaxed || mo == memory_order_release || + mo == memory_order_seq_cst); DCHECK(!((uptr)a % sizeof(*a))); - return __sync_fetch_and_add(&a->val_dont_use, v); + __atomic_store_n(&a->val_dont_use, v, mo); } -template -inline typename T::Type atomic_fetch_sub(volatile T *a, - typename T::Type v, memory_order mo) { +template +inline typename T::Type atomic_fetch_add(volatile T *a, typename T::Type v, + memory_order mo) { + DCHECK(!((uptr)a % sizeof(*a))); + return __atomic_fetch_add(&a->val_dont_use, v, mo); +} + +template +inline typename T::Type atomic_fetch_sub(volatile T *a, typename T::Type v, + memory_order mo) { (void)mo; DCHECK(!((uptr)a % sizeof(*a))); - return __sync_fetch_and_add(&a->val_dont_use, -v); + return __atomic_fetch_sub(&a->val_dont_use, v, mo); } -template -inline typename T::Type atomic_exchange(volatile T *a, - typename T::Type v, memory_order mo) { +template +inline typename T::Type atomic_exchange(volatile T *a, typename T::Type v, + memory_order mo) { DCHECK(!((uptr)a % sizeof(*a))); - if (mo & (memory_order_release | memory_order_acq_rel | memory_order_seq_cst)) - __sync_synchronize(); - v = __sync_lock_test_and_set(&a->val_dont_use, v); - if (mo == memory_order_seq_cst) - __sync_synchronize(); - return v; + return __atomic_exchange_n(&a->val_dont_use, v, mo); } template @@ -82,9 +85,8 @@ inline bool atomic_compare_exchange_strong(volatile T *a, typename T::Type *cmp, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } -template -inline bool atomic_compare_exchange_weak(volatile T *a, - typename T::Type *cmp, +template +inline bool atomic_compare_exchange_weak(volatile T *a, typename T::Type *cmp, typename T::Type xchg, memory_order mo) { return atomic_compare_exchange_strong(a, cmp, xchg, mo); @@ -92,13 +94,6 @@ inline bool atomic_compare_exchange_weak(volatile T *a, } // namespace __sanitizer -// This include provides explicit template instantiations for atomic_uint64_t -// on MIPS32, which does not directly support 8 byte atomics. It has to -// proceed the template definitions above. 
-#if defined(_MIPS_SIM) && defined(_ABIO32) && _MIPS_SIM == _ABIO32 -# include "sanitizer_atomic_clang_mips.h" -#endif - #undef ATOMIC_ORDER #endif // SANITIZER_ATOMIC_CLANG_H diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_mips.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_mips.h deleted file mode 100644 index f3d3052e5b7c5c..00000000000000 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_mips.h +++ /dev/null @@ -1,117 +0,0 @@ -//===-- sanitizer_atomic_clang_mips.h ---------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer/AddressSanitizer runtime. -// Not intended for direct inclusion. Include sanitizer_atomic.h. -// -//===----------------------------------------------------------------------===// - -#ifndef SANITIZER_ATOMIC_CLANG_MIPS_H -#define SANITIZER_ATOMIC_CLANG_MIPS_H - -namespace __sanitizer { - -// MIPS32 does not support atomics > 4 bytes. To address this lack of -// functionality, the sanitizer library provides helper methods which use an -// internal spin lock mechanism to emulate atomic operations when the size is -// 8 bytes. -static void __spin_lock(volatile int *lock) { - while (__sync_lock_test_and_set(lock, 1)) - while (*lock) { - } -} - -static void __spin_unlock(volatile int *lock) { __sync_lock_release(lock); } - -// Make sure the lock is on its own cache line to prevent false sharing. -// Put it inside a struct that is aligned and padded to the typical MIPS -// cacheline which is 32 bytes. 
-static struct { - int lock; - char pad[32 - sizeof(int)]; -} __attribute__((aligned(32))) lock = {0, {0}}; - -template <> -inline atomic_uint64_t::Type atomic_fetch_add(volatile atomic_uint64_t *ptr, - atomic_uint64_t::Type val, - memory_order mo) { - DCHECK(mo & - (memory_order_relaxed | memory_order_release | memory_order_seq_cst)); - DCHECK(!((uptr)ptr % sizeof(*ptr))); - - atomic_uint64_t::Type ret; - - __spin_lock(&lock.lock); - ret = *(const_cast(&ptr->val_dont_use)); - ptr->val_dont_use = ret + val; - __spin_unlock(&lock.lock); - - return ret; -} - -template <> -inline atomic_uint64_t::Type atomic_fetch_sub(volatile atomic_uint64_t *ptr, - atomic_uint64_t::Type val, - memory_order mo) { - return atomic_fetch_add(ptr, -val, mo); -} - -template <> -inline bool atomic_compare_exchange_strong(volatile atomic_uint64_t *ptr, - atomic_uint64_t::Type *cmp, - atomic_uint64_t::Type xchg, - memory_order mo) { - DCHECK(mo & - (memory_order_relaxed | memory_order_release | memory_order_seq_cst)); - DCHECK(!((uptr)ptr % sizeof(*ptr))); - - typedef atomic_uint64_t::Type Type; - Type cmpv = *cmp; - Type prev; - bool ret = false; - - __spin_lock(&lock.lock); - prev = *(const_cast(&ptr->val_dont_use)); - if (prev == cmpv) { - ret = true; - ptr->val_dont_use = xchg; - } - __spin_unlock(&lock.lock); - - return ret; -} - -template <> -inline atomic_uint64_t::Type atomic_load(const volatile atomic_uint64_t *ptr, - memory_order mo) { - DCHECK(mo & - (memory_order_relaxed | memory_order_release | memory_order_seq_cst)); - DCHECK(!((uptr)ptr % sizeof(*ptr))); - - atomic_uint64_t::Type zero = 0; - volatile atomic_uint64_t *Newptr = - const_cast(ptr); - return atomic_fetch_add(Newptr, zero, mo); -} - -template <> -inline void atomic_store(volatile atomic_uint64_t *ptr, atomic_uint64_t::Type v, - memory_order mo) { - DCHECK(mo & - (memory_order_relaxed | memory_order_release | memory_order_seq_cst)); - DCHECK(!((uptr)ptr % sizeof(*ptr))); - - __spin_lock(&lock.lock); - ptr->val_dont_use = v; - __spin_unlock(&lock.lock); -} - -} // namespace __sanitizer - -#endif // SANITIZER_ATOMIC_CLANG_MIPS_H - diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_other.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_other.h deleted file mode 100644 index 557082a636b879..00000000000000 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_other.h +++ /dev/null @@ -1,85 +0,0 @@ -//===-- sanitizer_atomic_clang_other.h --------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer/AddressSanitizer runtime. -// Not intended for direct inclusion. Include sanitizer_atomic.h. 
-// -//===----------------------------------------------------------------------===// - -#ifndef SANITIZER_ATOMIC_CLANG_OTHER_H -#define SANITIZER_ATOMIC_CLANG_OTHER_H - -namespace __sanitizer { - - -inline void proc_yield(int cnt) { - __asm__ __volatile__("" ::: "memory"); -} - -template -inline typename T::Type atomic_load( - const volatile T *a, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_consume - | memory_order_acquire | memory_order_seq_cst)); - DCHECK(!((uptr)a % sizeof(*a))); - typename T::Type v; - - if (sizeof(*a) < 8 || sizeof(void*) == 8) { - // Assume that aligned loads are atomic. - if (mo == memory_order_relaxed) { - v = a->val_dont_use; - } else if (mo == memory_order_consume) { - // Assume that processor respects data dependencies - // (and that compiler won't break them). - __asm__ __volatile__("" ::: "memory"); - v = a->val_dont_use; - __asm__ __volatile__("" ::: "memory"); - } else if (mo == memory_order_acquire) { - __asm__ __volatile__("" ::: "memory"); - v = a->val_dont_use; - __sync_synchronize(); - } else { // seq_cst - // E.g. on POWER we need a hw fence even before the store. - __sync_synchronize(); - v = a->val_dont_use; - __sync_synchronize(); - } - } else { - __atomic_load(const_cast(&a->val_dont_use), &v, - __ATOMIC_SEQ_CST); - } - return v; -} - -template -inline void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_release - | memory_order_seq_cst)); - DCHECK(!((uptr)a % sizeof(*a))); - - if (sizeof(*a) < 8 || sizeof(void*) == 8) { - // Assume that aligned stores are atomic. - if (mo == memory_order_relaxed) { - a->val_dont_use = v; - } else if (mo == memory_order_release) { - __sync_synchronize(); - a->val_dont_use = v; - __asm__ __volatile__("" ::: "memory"); - } else { // seq_cst - __sync_synchronize(); - a->val_dont_use = v; - __sync_synchronize(); - } - } else { - __atomic_store(&a->val_dont_use, &v, __ATOMIC_SEQ_CST); - } -} - -} // namespace __sanitizer - -#endif // #ifndef SANITIZER_ATOMIC_CLANG_OTHER_H diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_x86.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_x86.h deleted file mode 100644 index b81a354d209872..00000000000000 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_x86.h +++ /dev/null @@ -1,113 +0,0 @@ -//===-- sanitizer_atomic_clang_x86.h ----------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer/AddressSanitizer runtime. -// Not intended for direct inclusion. Include sanitizer_atomic.h. 
-// -//===----------------------------------------------------------------------===// - -#ifndef SANITIZER_ATOMIC_CLANG_X86_H -#define SANITIZER_ATOMIC_CLANG_X86_H - -namespace __sanitizer { - -inline void proc_yield(int cnt) { - __asm__ __volatile__("" ::: "memory"); - for (int i = 0; i < cnt; i++) - __asm__ __volatile__("pause"); - __asm__ __volatile__("" ::: "memory"); -} - -template -inline typename T::Type atomic_load( - const volatile T *a, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_consume - | memory_order_acquire | memory_order_seq_cst)); - DCHECK(!((uptr)a % sizeof(*a))); - typename T::Type v; - - if (sizeof(*a) < 8 || sizeof(void*) == 8) { - // Assume that aligned loads are atomic. - if (mo == memory_order_relaxed) { - v = a->val_dont_use; - } else if (mo == memory_order_consume) { - // Assume that processor respects data dependencies - // (and that compiler won't break them). - __asm__ __volatile__("" ::: "memory"); - v = a->val_dont_use; - __asm__ __volatile__("" ::: "memory"); - } else if (mo == memory_order_acquire) { - __asm__ __volatile__("" ::: "memory"); - v = a->val_dont_use; - // On x86 loads are implicitly acquire. - __asm__ __volatile__("" ::: "memory"); - } else { // seq_cst - // On x86 plain MOV is enough for seq_cst store. - __asm__ __volatile__("" ::: "memory"); - v = a->val_dont_use; - __asm__ __volatile__("" ::: "memory"); - } - } else { - // 64-bit load on 32-bit platform. - __asm__ __volatile__( - "movq %1, %%mm0;" // Use mmx reg for 64-bit atomic moves - "movq %%mm0, %0;" // (ptr could be read-only) - "emms;" // Empty mmx state/Reset FP regs - : "=m" (v) - : "m" (a->val_dont_use) - : // mark the mmx registers as clobbered -#ifdef __MMX__ - "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", -#endif // #ifdef __MMX__ - "memory"); - } - return v; -} - -template -inline void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_release - | memory_order_seq_cst)); - DCHECK(!((uptr)a % sizeof(*a))); - - if (sizeof(*a) < 8 || sizeof(void*) == 8) { - // Assume that aligned stores are atomic. - if (mo == memory_order_relaxed) { - a->val_dont_use = v; - } else if (mo == memory_order_release) { - // On x86 stores are implicitly release. - __asm__ __volatile__("" ::: "memory"); - a->val_dont_use = v; - __asm__ __volatile__("" ::: "memory"); - } else { // seq_cst - // On x86 stores are implicitly release. - __asm__ __volatile__("" ::: "memory"); - a->val_dont_use = v; - __sync_synchronize(); - } - } else { - // 64-bit store on 32-bit platform. 
- __asm__ __volatile__( - "movq %1, %%mm0;" // Use mmx reg for 64-bit atomic moves - "movq %%mm0, %0;" - "emms;" // Empty mmx state/Reset FP regs - : "=m" (a->val_dont_use) - : "m" (v) - : // mark the mmx registers as clobbered -#ifdef __MMX__ - "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", -#endif // #ifdef __MMX__ - "memory"); - if (mo == memory_order_seq_cst) - __sync_synchronize(); - } -} - -} // namespace __sanitizer - -#endif // #ifndef SANITIZER_ATOMIC_CLANG_X86_H diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h index 31317adcdfc99f..d80bfdbf6a0812 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h @@ -70,8 +70,8 @@ inline void proc_yield(int cnt) { template inline typename T::Type atomic_load( const volatile T *a, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_consume - | memory_order_acquire | memory_order_seq_cst)); + DCHECK(mo == memory_order_relaxed || mo == memory_order_consume || + mo == memory_order_acquire || mo == memory_order_seq_cst); DCHECK(!((uptr)a % sizeof(*a))); typename T::Type v; // FIXME(dvyukov): 64-bit load is not atomic on 32-bits. @@ -87,8 +87,8 @@ inline typename T::Type atomic_load( template inline void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_release - | memory_order_seq_cst)); + DCHECK(mo == memory_order_relaxed || mo == memory_order_release || + mo == memory_order_seq_cst); DCHECK(!((uptr)a % sizeof(*a))); // FIXME(dvyukov): 64-bit store is not atomic on 32-bits. if (mo == memory_order_relaxed) { diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn index 0519073239430a..f7f1fce10bf5f5 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn @@ -33,9 +33,6 @@ source_set("sources") { "sanitizer_asm.h", "sanitizer_atomic.h", "sanitizer_atomic_clang.h", - "sanitizer_atomic_clang_mips.h", - "sanitizer_atomic_clang_other.h", - "sanitizer_atomic_clang_x86.h", "sanitizer_atomic_msvc.h", "sanitizer_bitvector.h", "sanitizer_bvgraph.h", From a88ea8fbb3953c2fe2887438baf342e381a79d8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 17 Apr 2024 08:43:11 -0700 Subject: [PATCH 272/300] [flang][cuda] Update memory effect on fir.cuda_allocate op (#88930) Add MemRead effect on the box operand as the descriptor might be read when performing the allocation of the data. Also update the expected type of the box operand to be a reference. Check in the verifier that this is a reference to a box or class type. 
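For illustration, the tightened verifier rejects a box operand that is not a
reference to a box/class value, as in this abridged form of the cuf-invalid.fir
update below (the angle-bracketed type parameters here are restored by hand and
are an approximation of the test):
```
%1 = fir.alloca i32
// error: 'fir.cuda_allocate' op expect box to be a reference to a class or box type value
%2 = fir.cuda_allocate %1 : !fir.ref<i32> {cuda_attr = #fir.cuda<device>} -> i32
```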
This addresses the comment made post-commit on #88586
---
 flang/include/flang/Optimizer/Dialect/FIROps.td | 2 +-
 flang/lib/Optimizer/Dialect/FIROps.cpp | 2 +-
 flang/test/Fir/cuf-invalid.fir | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 580e840587abb2..92790a691e4731 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3200,7 +3200,7 @@ def fir_CUDAAllocateOp : fir_Op<"cuda_allocate", [AttrSizedOperandSegments,
     is initialized before with the standard flang runtime calls.
   }];

-  let arguments = (ins Arg:$box,
+  let arguments = (ins Arg:$box,
                        Arg, "", [MemWrite]>:$errmsg,
                        Optional:$stream,
                        Arg, "", [MemWrite]>:$pinned,
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index be27256d911b31..5c24c95db427aa 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -3998,7 +3998,7 @@ mlir::LogicalResult fir::CUDAAllocateOp::verify() {
     return emitOpError("pinned and stream cannot appears at the same time");
   if (!fir::unwrapRefType(getBox().getType()).isa())
     return emitOpError(
-        "expect box to be a reference to/or a class or box type value");
+        "expect box to be a reference to a class or box type value");
   if (getSource() &&
       !fir::unwrapRefType(getSource().getType()).isa())
     return emitOpError(
diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir
index 5d3aa55cf346a4..6c533a32ccf9ba 100644
--- a/flang/test/Fir/cuf-invalid.fir
+++ b/flang/test/Fir/cuf-invalid.fir
@@ -16,7 +16,7 @@ func.func @_QPsub1() {
 func.func @_QPsub1() {
   %1 = fir.alloca i32
-  // expected-error@+1{{'fir.cuda_allocate' op expect box to be a reference to/or a class or box type value}}
+  // expected-error@+1{{'fir.cuda_allocate' op expect box to be a reference to a class or box type value}}
   %2 = fir.cuda_allocate %1 : !fir.ref {cuda_attr = #fir.cuda} -> i32
   return
 }

From da70f2cdcde8cb96e75ce0236db1fb5353407a69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Wed, 17 Apr 2024 08:43:25 -0700
Subject: [PATCH 273/300] [flang][cuda] Lower ALLOCATE for device variable
 (#88980)

Replace the runtime call to `AllocatableAllocate` for CUDA device
variables with the newly added `fir.cuda_allocate` operation.
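For example, an allocation like the following (from the new
cuda-allocatable.cuf test added below) now lowers to a single
fir.cuda_allocate operation instead of a call to the AllocatableAllocate
runtime entry point:
```
subroutine sub1()
  real, allocatable, device :: a(:)
  allocate(a(10))
end subroutine
```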
--- flang/lib/Lower/Allocatable.cpp | 57 +++++++++-- flang/test/Lower/CUDA/cuda-allocatable.cuf | 107 +++++++++++++++++++++ 2 files changed, 154 insertions(+), 10 deletions(-) create mode 100644 flang/test/Lower/CUDA/cuda-allocatable.cuf diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp index 42e78fc96e4445..1d434d512d0c5c 100644 --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -14,6 +14,7 @@ #include "flang/Evaluate/tools.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/ConvertType.h" +#include "flang/Lower/ConvertVariable.h" #include "flang/Lower/IterationSpace.h" #include "flang/Lower/Mangler.h" #include "flang/Lower/OpenACC.h" @@ -368,20 +369,17 @@ class AllocateStmtHelper { [&](const Fortran::parser::AllocOpt::Mold &mold) { moldExpr = Fortran::semantics::GetExpr(mold.v.value()); }, - [&](const Fortran::parser::AllocOpt::Stream &) { - TODO(loc, "CUDA ALLOCATE(STREAM=)"); + [&](const Fortran::parser::AllocOpt::Stream &stream) { + streamExpr = Fortran::semantics::GetExpr(stream.v.value()); }, - [&](const Fortran::parser::AllocOpt::Pinned &) { - TODO(loc, "CUDA ALLOCATE(PINNED=)"); + [&](const Fortran::parser::AllocOpt::Pinned &pinned) { + pinnedExpr = Fortran::semantics::GetExpr(pinned.v.value()); }, }, allocOption.u); } void lowerAllocation(const Allocation &alloc) { - if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) - TODO(loc, "Allocation of variable with CUDA attributes"); - fir::MutableBoxValue boxAddr = genMutableBoxValue(converter, loc, alloc.getAllocObj()); @@ -456,7 +454,8 @@ class AllocateStmtHelper { const fir::MutableBoxValue &box) { if (!box.isDerived() && !errorManager.hasStatSpec() && !alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() && - !useAllocateRuntime && !box.isPointer()) { + !useAllocateRuntime && !box.isPointer() && + !Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) { // Pointers must use PointerAllocate so that their deallocations // can be validated. genInlinedAllocation(alloc, box); @@ -472,7 +471,12 @@ class AllocateStmtHelper { genSetType(alloc, box, loc); genSetDeferredLengthParameters(alloc, box); genAllocateObjectBounds(alloc, box); - mlir::Value stat = genRuntimeAllocate(builder, loc, box, errorManager); + mlir::Value stat; + if (!Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) + stat = genRuntimeAllocate(builder, loc, box, errorManager); + else + stat = + genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol()); fir::factory::syncMutableBoxFromIRBox(builder, loc, box); postAllocationAction(alloc); errorManager.assignStat(builder, loc, stat); @@ -602,7 +606,10 @@ class AllocateStmtHelper { genSetDeferredLengthParameters(alloc, box); genAllocateObjectBounds(alloc, box); mlir::Value stat; - if (isSource) + if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) + stat = + genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol()); + else if (isSource) stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager); else stat = genRuntimeAllocate(builder, loc, box, errorManager); @@ -717,6 +724,34 @@ class AllocateStmtHelper { return nullptr; } + mlir::Value genCudaAllocate(fir::FirOpBuilder &builder, mlir::Location loc, + const fir::MutableBoxValue &box, + ErrorManager &errorManager, + const Fortran::semantics::Symbol &sym) { + Fortran::lower::StatementContext stmtCtx; + fir::CUDADataAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(), + sym); + mlir::Value errmsg = errMsgExpr ? 
errorManager.errMsgAddr : nullptr; + mlir::Value stream = + streamExpr + ? fir::getBase(converter.genExprValue(loc, *streamExpr, stmtCtx)) + : nullptr; + mlir::Value pinned = + pinnedExpr + ? fir::getBase(converter.genExprAddr(loc, *pinnedExpr, stmtCtx)) + : nullptr; + mlir::Value source = sourceExpr ? fir::getBase(sourceExv) : nullptr; + + // Keep return type the same as a standard AllocatableAllocate call. + mlir::Type retTy = fir::runtime::getModel()(builder.getContext()); + return builder + .create( + loc, retTy, box.getAddr(), errmsg, stream, pinned, source, cudaAttr, + errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr) + .getResult(); + } + Fortran::lower::AbstractConverter &converter; fir::FirOpBuilder &builder; const Fortran::parser::AllocateStmt &stmt; @@ -724,6 +759,8 @@ class AllocateStmtHelper { const Fortran::lower::SomeExpr *moldExpr{nullptr}; const Fortran::lower::SomeExpr *statExpr{nullptr}; const Fortran::lower::SomeExpr *errMsgExpr{nullptr}; + const Fortran::lower::SomeExpr *pinnedExpr{nullptr}; + const Fortran::lower::SomeExpr *streamExpr{nullptr}; // If the allocate has a type spec, lenParams contains the // value of the length parameters that were specified inside. llvm::SmallVector lenParams; diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf new file mode 100644 index 00000000000000..55223011e8d9e9 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -0,0 +1,107 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test lowering of CUDA allocatable allocate/deallocate statements. + +subroutine sub1() + real, allocatable, device :: a(:) + allocate(a(10)) +end subroutine + +! CHECK-LABEL: func.func @_QPsub1() +! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} +! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: fir.call @_FortranAAllocatableSetBounds +! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> {cuda_attr = #fir.cuda} -> i32 + +subroutine sub2() + real, allocatable, managed :: a(:) + integer :: istat + allocate(a(10), stat=istat) +end subroutine + +! CHECK-LABEL: func.func @_QPsub2() +! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub2Ea"} +! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub2Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub2Eistat"} +! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub2Eistat"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: fir.call @_FortranAAllocatableSetBounds +! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> {cuda_attr = #fir.cuda, hasStat} -> i32 +! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref + +subroutine sub3() + integer, allocatable, pinned :: a(:,:) + logical :: plog + allocate(a(20,30), pinned = plog) +end subroutine + +! CHECK-LABEL: func.func @_QPsub3() +! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub3Ea"} +! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub3Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! 
CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"} +! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +! CHECK-2: fir.call @_FortranAAllocatableSetBounds +! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref>) {cuda_attr = #fir.cuda} -> i32 + +subroutine sub4() + real, allocatable, unified :: a(:) + integer :: istream + allocate(a(10), stream=istream) +end subroutine + +! CHECK-LABEL: func.func @_QPsub4() +! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub4Ea"} +! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub4Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"} +! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: fir.call @_FortranAAllocatableSetBounds +! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref +! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda} -> i32 + +subroutine sub5() + real, allocatable, device :: a(:) + real, allocatable :: b(:) + allocate(a, source=b) +end subroutine + +! CHECK-LABEL: func.func @_QPsub5() +! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub5Ea"} +! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub5Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box>> {bindc_name = "b", uniq_name = "_QFsub5Eb"} +! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub5Eb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref>>> +! CHECK: fir.call @_FortranAAllocatableSetBounds +! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref>>> source(%[[LOAD_B]] : !fir.box>>) {cuda_attr = #fir.cuda} -> i32 + +subroutine sub6() + real, allocatable, device :: a(:) + real, allocatable :: b(:) + allocate(a, mold=b) +end subroutine + +! CHECK-LABEL: func.func @_QPsub6() +! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub6Ea"} +! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub6Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box>> {bindc_name = "b", uniq_name = "_QFsub6Eb"} +! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub6Eb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref>>> +! CHECK: fir.call @_FortranAAllocatableApplyMold +! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref>>> {cuda_attr = #fir.cuda} -> i32 + +subroutine sub7() + real, allocatable, device :: a(:) + integer :: istat + character(50) :: err + allocate(a(100), stat=istat, errmsg=err) +end subroutine + +! CHECK-LABEL: func.func @_QPsub7() +! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub7Ea"} +! 
CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub7Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>)
+! CHECK: %[[ERR:.*]] = fir.alloca !fir.char<1,50> {bindc_name = "err", uniq_name = "_QFsub7Eerr"}
+! CHECK: %[[ERR_DECL:.*]]:2 = hlfir.declare %[[ERR]] typeparams %{{.*}} {uniq_name = "_QFsub7Eerr"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>)
+! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub7Eistat"}
+! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub7Eistat"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref>) -> !fir.box>
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> errmsg(%[[ERR_BOX]] : !fir.box>) {cuda_attr = #fir.cuda, hasStat} -> i32
+! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref

From 19c6a7feca6e1558ef7cbe18efd2477c1126899d Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas
Date: Wed, 17 Apr 2024 17:16:58 +0100
Subject: [PATCH 274/300] [FMV] Remove useless features according to the
 latest ACLE spec. (#88965)

As explained in https://github.com/ARM-software/acle/pull/315 we are
deprecating features which aren't adding any value. These are:
sha1, pmull, dit, dgh, ebf16, sve-bf16, sve-ebf16, sve-i8mm,
sve2-pmull128, memtag2, memtag3, ssbs2, bti, ls64_v, ls64_accdata
---
 clang/test/CodeGen/aarch64-cpu-supports.c | 24 +-
 .../CodeGen/aarch64-mixed-target-attributes.c | 12 +-
 .../test/CodeGen/attr-target-clones-aarch64.c | 77 ++---
 clang/test/CodeGen/attr-target-version.c | 326 +++++++++---------
 .../CodeGenCXX/attr-target-clones-aarch64.cpp | 41 +--
 clang/test/CodeGenCXX/attr-target-version.cpp | 22 +-
 clang/test/Sema/aarch64-cpu-supports.c | 2 +-
 clang/test/Sema/attr-target-clones-aarch64.c | 21 +-
 clang/test/Sema/attr-target-version.c | 18 +-
 clang/test/SemaCXX/attr-target-version.cpp | 16 +-
 compiler-rt/lib/builtins/cpu_model/aarch64.c | 17 +-
 .../builtins/cpu_model/aarch64/fmv/apple.inc | 6 +-
 .../cpu_model/aarch64/fmv/fuchsia.inc | 4 -
 .../builtins/cpu_model/aarch64/fmv/mrs.inc | 42 +--
 .../llvm/TargetParser/AArch64TargetParser.h | 36 +-
 15 files changed, 287 insertions(+), 377 deletions(-)

diff --git a/clang/test/CodeGen/aarch64-cpu-supports.c b/clang/test/CodeGen/aarch64-cpu-supports.c
index c54b7475a3fd5f..7fad9724dfb6c5 100644
--- a/clang/test/CodeGen/aarch64-cpu-supports.c
+++ b/clang/test/CodeGen/aarch64-cpu-supports.c
@@ -1,15 +1,17 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --version 2
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s

+//.
 // CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
+//.
// CHECK-LABEL: define dso_local i32 @main // CHECK-SAME: () #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70368744177664 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70368744177664 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 34359738368 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 34359738368 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] // CHECK: if.then: @@ -17,8 +19,8 @@ // CHECK-NEXT: br label [[RETURN:%.*]] // CHECK: if.end: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 9070970929152 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 9070970929152 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 17716740096 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 17716740096 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[IF_THEN1:%.*]], label [[IF_END2:%.*]] // CHECK: if.then1: @@ -26,8 +28,8 @@ // CHECK-NEXT: br label [[RETURN]] // CHECK: if.end2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 166633186212708352 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 166633186212708352 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 5222680231936 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 5222680231936 // CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] // CHECK-NEXT: br i1 [[TMP11]], label [[IF_THEN3:%.*]], label [[IF_END4:%.*]] // CHECK: if.then3: @@ -49,10 +51,10 @@ int main(void) { if (__builtin_cpu_supports("sb")) return 1; - if (__builtin_cpu_supports("sve2-pmull128+memtag")) + if (__builtin_cpu_supports("sve2-aes+memtag")) return 2; - if (__builtin_cpu_supports("sme2+ls64_v+wfxt")) + if (__builtin_cpu_supports("sme2+ls64+wfxt")) return 3; if (__builtin_cpu_supports("avx2")) @@ -60,3 +62,9 @@ int main(void) { return 0; } +//. +// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. 
diff --git a/clang/test/CodeGen/aarch64-mixed-target-attributes.c b/clang/test/CodeGen/aarch64-mixed-target-attributes.c index aef6ce36ab1c05..be290ff9ecee67 100644 --- a/clang/test/CodeGen/aarch64-mixed-target-attributes.c +++ b/clang/test/CodeGen/aarch64-mixed-target-attributes.c @@ -69,8 +69,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -143,8 +143,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -210,8 +210,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c index 8c8b951e9118d7..c715001f6a722f 100644 --- a/clang/test/CodeGen/attr-target-clones-aarch64.c +++ b/clang/test/CodeGen/attr-target-clones-aarch64.c @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fmv -S -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NOFMV int __attribute__((target_clones("lse+aes", "sve2"))) ftc(void) { return 0; } -int __attribute__((target_clones("sha2", "sha2+memtag2", " default "))) ftc_def(void) { return 1; } +int __attribute__((target_clones("sha2", "sha2+memtag", " default "))) ftc_def(void) { return 1; } int __attribute__((target_clones("sha2", "default"))) ftc_dup1(void) { return 2; } int __attribute__((target_clones("fp", "crc+dotprod"))) ftc_dup2(void) { return 3; } int foo() { @@ -12,7 +12,7 @@ int foo() { inline int __attribute__((target_clones("rng+simd", "rcpc+predres", "sve2-aes+wfxt"))) ftc_inline1(void) { return 1; } inline int __attribute__((target_clones("fp16", "fcma+sve2-bitperm", "default"))) ftc_inline2(void); -inline int __attribute__((target_clones("bti", "sve+sb"))) ftc_inline3(void) { return 3; } +inline int __attribute__((target_clones("mops", 
"sve+sb"))) ftc_inline3(void) { return 3; } int __attribute__((target_clones("default"))) ftc_direct(void) { return 4; } @@ -56,16 +56,16 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 16512 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 16512 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 8320 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 8320 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @ftc._MaesMlse // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 68719476736 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 68719476736 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 268435456 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 268435456 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -81,7 +81,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_def._Mmemtag2Msha2( +// CHECK-LABEL: @ftc_def._MmemtagMsha2( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // @@ -90,16 +90,16 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 17592186048512 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 17592186048512 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 17179871232 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 17179871232 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @ftc_def._Mmemtag2Msha2 +// CHECK-NEXT: ret ptr @ftc_def._MmemtagMsha2 // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 4096 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 4096 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 2048 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 2048 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -118,8 +118,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 4096 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4096 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 2048 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2048 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // 
CHECK: resolver_return: @@ -198,16 +198,16 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 18014535948435456 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014535948435456 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 550292684800 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 550292684800 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @ftc_inline1._Msve2-aesMwfxt // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 68720001024 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 68720001024 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -228,16 +228,16 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 549757911040 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 549757911040 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1074003968 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1074003968 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @ftc_inline2._MfcmaMsve2-bitperm // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 65536 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 65536 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16384 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16384 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -250,20 +250,20 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70369817919488 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817919488 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 34393292800 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 34393292800 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @ftc_inline3._MsbMsve // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624 +// CHECK-NEXT: 
[[TMP5:%.*]] = and i64 [[TMP4]], 17592186044416 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 17592186044416 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @ftc_inline3._Mbti +// CHECK-NEXT: ret ptr @ftc_inline3._Mmops // CHECK: resolver_else2: // CHECK-NEXT: ret ptr @ftc_inline3.default // @@ -329,7 +329,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_inline3._Mbti( +// CHECK-LABEL: @ftc_inline3._Mmops( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // @@ -407,17 +407,16 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK: attributes #[[ATTR0:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon" } // CHECK: attributes #[[ATTR1:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2" } // CHECK: attributes #[[ATTR2:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sha2" } -// CHECK: attributes #[[ATTR3:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+mte,+neon,+sha2" } -// CHECK: attributes #[[ATTR4:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } -// CHECK: attributes #[[ATTR5:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+dotprod,+fp-armv8,+neon" } -// CHECK: attributes #[[ATTR6:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// CHECK: attributes #[[ATTR7:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } -// CHECK: attributes #[[ATTR8:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-bitperm" } -// CHECK: attributes #[[ATTR9:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rand" } -// CHECK: attributes #[[ATTR10:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+predres,+rcpc" } -// CHECK: attributes #[[ATTR11:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+wfxt" } -// CHECK: attributes #[[ATTR12:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti" } -// CHECK: attributes #[[ATTR13:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sb,+sve" } +// CHECK: attributes #[[ATTR3:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR4:[0-9]+]] = { noinline nounwind optnone 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+dotprod,+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR5:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #[[ATTR6:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } +// CHECK: attributes #[[ATTR7:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-bitperm" } +// CHECK: attributes #[[ATTR8:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rand" } +// CHECK: attributes #[[ATTR9:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+predres,+rcpc" } +// CHECK: attributes #[[ATTR10:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+wfxt" } +// CHECK: attributes #[[ATTR11:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" } +// CHECK: attributes #[[ATTR12:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sb,+sve" } //. // CHECK-NOFMV: attributes #[[ATTR0:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } // CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index dd4cbbf5a89860..e71370e6d91df1 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -5,11 +5,11 @@ int __attribute__((target_version("rng+flagm+fp16fml"))) fmv(void) { return 1; } int __attribute__((target_version("flagm2+sme-i16i64"))) fmv(void) { return 2; } int __attribute__((target_version("lse+sha2"))) fmv(void) { return 3; } -int __attribute__((target_version("dotprod+ls64_accdata"))) fmv(void) { return 4; } +int __attribute__((target_version("dotprod+ls64"))) fmv(void) { return 4; } int __attribute__((target_version("fp16fml+memtag"))) fmv(void) { return 5; } int __attribute__((target_version("fp+aes"))) fmv(void) { return 6; } -int __attribute__((target_version("crc+ls64_v"))) fmv(void) { return 7; } -int __attribute__((target_version("bti"))) fmv(void) { return 8; } +int __attribute__((target_version("crc+ls64"))) fmv(void) { return 7; } +int __attribute__((target_version("mops"))) fmv(void) { return 8; } int __attribute__((target_version("sme2"))) fmv(void) { return 9; } int __attribute__((target_version("default"))) fmv(void); int __attribute__((target_version("ls64+simd"))) fmv_one(void) { return 1; } @@ -17,25 +17,25 @@ int __attribute__((target_version("dpb"))) fmv_one(void) { return 2; } int __attribute__((target_version("default"))) fmv_one(void); int __attribute__((target_version("fp"))) fmv_two(void) { return 1; } int __attribute__((target_version("simd"))) fmv_two(void) { return 2; } -int __attribute__((target_version("dgh"))) fmv_two(void) { return 3; } +int __attribute__((target_version("frintts"))) fmv_two(void) { return 
3; } int __attribute__((target_version("fp16+simd"))) fmv_two(void) { return 4; } int __attribute__((target_version("default"))) fmv_two(void); int foo() { return fmv()+fmv_one()+fmv_two(); } -inline int __attribute__((target_version("sha1+pmull+f64mm"))) fmv_inline(void) { return 1; } +inline int __attribute__((target_version("crypto+f64mm"))) fmv_inline(void) { return 1; } inline int __attribute__((target_version("fp16+fcma+rdma+sme+ fp16 "))) fmv_inline(void) { return 2; } inline int __attribute__((target_version("sha3+i8mm+f32mm"))) fmv_inline(void) { return 12; } -inline int __attribute__((target_version("dit+sve-ebf16"))) fmv_inline(void) { return 8; } +inline int __attribute__((target_version("sme2+ssbs"))) fmv_inline(void) { return 8; } inline int __attribute__((target_version("dpb+rcpc2 "))) fmv_inline(void) { return 6; } inline int __attribute__((target_version(" dpb2 + jscvt"))) fmv_inline(void) { return 7; } inline int __attribute__((target_version("rcpc+frintts"))) fmv_inline(void) { return 3; } -inline int __attribute__((target_version("sve+sve-bf16"))) fmv_inline(void) { return 4; } +inline int __attribute__((target_version("sve+sme"))) fmv_inline(void) { return 4; } inline int __attribute__((target_version("sve2-aes+sve2-sha3"))) fmv_inline(void) { return 5; } -inline int __attribute__((target_version("sve2+sve2-pmull128+sve2-bitperm"))) fmv_inline(void) { return 9; } -inline int __attribute__((target_version("sve2-sm4+memtag2"))) fmv_inline(void) { return 10; } -inline int __attribute__((target_version("memtag3+rcpc3+mops"))) fmv_inline(void) { return 11; } +inline int __attribute__((target_version("sve2+sve2-aes+sve2-bitperm"))) fmv_inline(void) { return 9; } +inline int __attribute__((target_version("sve2-sm4+memtag"))) fmv_inline(void) { return 10; } +inline int __attribute__((target_version("memtag+rcpc3+mops"))) fmv_inline(void) { return 11; } inline int __attribute__((target_version("aes+dotprod"))) fmv_inline(void) { return 13; } inline int __attribute__((target_version("simd+fp16fml"))) fmv_inline(void) { return 14; } inline int __attribute__((target_version("fp+sm4"))) fmv_inline(void) { return 15; } @@ -186,7 +186,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv._MdotprodMls64_accdata +// CHECK-LABEL: define {{[^@]+}}@fmv._MdotprodMls64 // CHECK-SAME: () #[[ATTR3:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 @@ -207,14 +207,14 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv._McrcMls64_v +// CHECK-LABEL: define {{[^@]+}}@fmv._McrcMls64 // CHECK-SAME: () #[[ATTR6:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 7 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv._Mbti +// CHECK-LABEL: define {{[^@]+}}@fmv._Mmops // CHECK-SAME: () #[[ATTR7:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 8 @@ -256,7 +256,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mdgh +// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfrintts // CHECK-SAME: () #[[ATTR11:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 @@ -271,7 +271,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // 
CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@foo -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv() // CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one() @@ -293,68 +293,68 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: ret ptr @fmv._MflagmMfp16fmlMrng // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927940 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 72057594037927940 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 2199023255556 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 2199023255556 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: // CHECK-NEXT: ret ptr @fmv._Mflagm2Msme-i16i64 // CHECK: resolver_else2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 9007199254741008 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 9007199254741008 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 274877906960 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 274877906960 // CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] // CHECK-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] // CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @fmv._MdotprodMls64_accdata +// CHECK-NEXT: ret ptr @fmv._MdotprodMls64 // CHECK: resolver_else4: // CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 4503599627371520 -// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 4503599627371520 +// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 274877907968 +// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 274877907968 // CHECK-NEXT: [[TMP15:%.*]] = and i1 true, [[TMP14]] // CHECK-NEXT: br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]] // CHECK: resolver_return5: -// CHECK-NEXT: ret ptr @fmv._McrcMls64_v +// CHECK-NEXT: ret ptr @fmv._McrcMls64 // CHECK: resolver_else6: // CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 8796093022216 -// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 8796093022216 +// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 17179869192 +// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 17179869192 // CHECK-NEXT: [[TMP19:%.*]] = and i1 true, [[TMP18]] // CHECK-NEXT: br i1 [[TMP19]], label [[RESOLVER_RETURN7:%.*]], label [[RESOLVER_ELSE8:%.*]] // CHECK: resolver_return7: // CHECK-NEXT: ret ptr @fmv._Mfp16fmlMmemtag // CHECK: resolver_else8: // CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 16640 -// CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 16640 +// CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 8448 +// CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 8448 // CHECK-NEXT: [[TMP23:%.*]] = and i1 true, [[TMP22]] // CHECK-NEXT: br i1 [[TMP23]], label [[RESOLVER_RETURN9:%.*]], label [[RESOLVER_ELSE10:%.*]] // CHECK: resolver_return9: // CHECK-NEXT: ret ptr @fmv._MaesMfp // CHECK: resolver_else10: // CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: 
[[TMP25:%.*]] = and i64 [[TMP24]], 4224 -// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 4224 +// CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], 2176 +// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 2176 // CHECK-NEXT: [[TMP27:%.*]] = and i1 true, [[TMP26]] // CHECK-NEXT: br i1 [[TMP27]], label [[RESOLVER_RETURN11:%.*]], label [[RESOLVER_ELSE12:%.*]] // CHECK: resolver_return11: // CHECK-NEXT: ret ptr @fmv._MlseMsha2 // CHECK: resolver_else12: // CHECK-NEXT: [[TMP28:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 144115188075855872 -// CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 144115188075855872 +// CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 17592186044416 +// CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 17592186044416 // CHECK-NEXT: [[TMP31:%.*]] = and i1 true, [[TMP30]] // CHECK-NEXT: br i1 [[TMP31]], label [[RESOLVER_RETURN13:%.*]], label [[RESOLVER_ELSE14:%.*]] // CHECK: resolver_return13: -// CHECK-NEXT: ret ptr @fmv._Msme2 +// CHECK-NEXT: ret ptr @fmv._Mmops // CHECK: resolver_else14: // CHECK-NEXT: [[TMP32:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP32]], 1125899906842624 -// CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 1125899906842624 +// CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP32]], 4398046511104 +// CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 4398046511104 // CHECK-NEXT: [[TMP35:%.*]] = and i1 true, [[TMP34]] // CHECK-NEXT: br i1 [[TMP35]], label [[RESOLVER_RETURN15:%.*]], label [[RESOLVER_ELSE16:%.*]] // CHECK: resolver_return15: -// CHECK-NEXT: ret ptr @fmv._Mbti +// CHECK-NEXT: ret ptr @fmv._Msme2 // CHECK: resolver_else16: // CHECK-NEXT: ret ptr @fmv.default // @@ -363,16 +363,16 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 2251799813685760 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2251799813685760 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 274877907456 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 274877907456 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @fmv_one._Mls64Msimd // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 262144 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 262144 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 32768 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 32768 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -385,20 +385,20 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 66048 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 66048 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 16896 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 16896 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, 
[[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @fmv_two._Mfp16Msimd // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 33554432 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 33554432 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 2097152 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 2097152 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @fmv_two._Mdgh +// CHECK-NEXT: ret ptr @fmv_two._Mfrintts // CHECK: resolver_else2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 512 @@ -421,49 +421,49 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_e.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 20 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_d._Msb -// CHECK-SAME: () #[[ATTR13:[0-9]+]] { +// CHECK-SAME: () #[[ATTR14:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_d.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 111 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_c._Mssbs -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret void // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_c.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret void // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@goo -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline() // CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e() @@ -477,96 +477,96 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 4398048673856 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4398048673856 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 8590213184 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 8590213184 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @fmv_inline._MfcmaMfp16MrdmMsme // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 864726312827224064 -// CHECK-NEXT: 
[[TMP6:%.*]] = icmp eq i64 [[TMP5]], 864726312827224064 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 26405458935808 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 26405458935808 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @fmv_inline._Mmemtag3MmopsMrcpc3 +// CHECK-NEXT: ret ptr @fmv_inline._MmemtagMmopsMrcpc3 // CHECK: resolver_else2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 893353197568 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 893353197568 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 1879048192 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 1879048192 // CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] // CHECK-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] // CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @fmv_inline._Msve2Msve2-bitpermMsve2-pmull128 +// CHECK-NEXT: ret ptr @fmv_inline._Msve2Msve2-aesMsve2-bitperm // CHECK: resolver_else4: // CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 34359773184 -// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 34359773184 +// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 71307264 +// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 71307264 // CHECK-NEXT: [[TMP15:%.*]] = and i1 true, [[TMP14]] // CHECK-NEXT: br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]] // CHECK: resolver_return5: -// CHECK-NEXT: ret ptr @fmv_inline._Mf64mmMpmullMsha1 +// CHECK-NEXT: ret ptr @fmv_inline._Mf32mmMi8mmMsha3 // CHECK: resolver_else6: // CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 17246986240 -// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 17246986240 +// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 4535485464576 +// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 4535485464576 // CHECK-NEXT: [[TMP19:%.*]] = and i1 true, [[TMP18]] // CHECK-NEXT: br i1 [[TMP19]], label [[RESOLVER_RETURN7:%.*]], label [[RESOLVER_ELSE8:%.*]] // CHECK: resolver_return7: -// CHECK-NEXT: ret ptr @fmv_inline._Mf32mmMi8mmMsha3 +// CHECK-NEXT: ret ptr @fmv_inline._Msme2Mssbs // CHECK: resolver_else8: // CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 19791209299968 -// CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 19791209299968 +// CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 21474836480 +// CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 21474836480 // CHECK-NEXT: [[TMP23:%.*]] = and i1 true, [[TMP22]] // CHECK-NEXT: br i1 [[TMP23]], label [[RESOLVER_RETURN9:%.*]], label [[RESOLVER_ELSE10:%.*]] // CHECK: resolver_return9: -// CHECK-NEXT: ret ptr @fmv_inline._Mmemtag2Msve2-sm4 +// CHECK-NEXT: ret ptr @fmv_inline._MmemtagMsve2-sm4 // CHECK: resolver_else10: // CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], 1236950581248 -// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 1236950581248 +// CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], 8623489024 +// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 8623489024 // CHECK-NEXT: [[TMP27:%.*]] = and i1 true, [[TMP26]] // CHECK-NEXT: br i1 [[TMP27]], label 
[[RESOLVER_RETURN11:%.*]], label [[RESOLVER_ELSE12:%.*]] // CHECK: resolver_return11: -// CHECK-NEXT: ret ptr @fmv_inline._Msve2-aesMsve2-sha3 +// CHECK-NEXT: ret ptr @fmv_inline._MsmeMsve // CHECK: resolver_else12: // CHECK-NEXT: [[TMP28:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 4295098368 -// CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 4295098368 +// CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 2684354560 +// CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 2684354560 // CHECK-NEXT: [[TMP31:%.*]] = and i1 true, [[TMP30]] // CHECK-NEXT: br i1 [[TMP31]], label [[RESOLVER_RETURN13:%.*]], label [[RESOLVER_ELSE14:%.*]] // CHECK: resolver_return13: -// CHECK-NEXT: ret ptr @fmv_inline._MditMsve-ebf16 +// CHECK-NEXT: ret ptr @fmv_inline._Msve2-aesMsve2-sha3 // CHECK: resolver_else14: // CHECK-NEXT: [[TMP32:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP32]], 3221225472 -// CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 3221225472 +// CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP32]], 281475110928384 +// CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 281475110928384 // CHECK-NEXT: [[TMP35:%.*]] = and i1 true, [[TMP34]] // CHECK-NEXT: br i1 [[TMP35]], label [[RESOLVER_RETURN15:%.*]], label [[RESOLVER_ELSE16:%.*]] // CHECK: resolver_return15: -// CHECK-NEXT: ret ptr @fmv_inline._MsveMsve-bf16 +// CHECK-NEXT: ret ptr @fmv_inline._McryptoMf64mm // CHECK: resolver_else16: // CHECK-NEXT: [[TMP36:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 20971520 -// CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[TMP37]], 20971520 +// CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 2621440 +// CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[TMP37]], 2621440 // CHECK-NEXT: [[TMP39:%.*]] = and i1 true, [[TMP38]] // CHECK-NEXT: br i1 [[TMP39]], label [[RESOLVER_RETURN17:%.*]], label [[RESOLVER_ELSE18:%.*]] // CHECK: resolver_return17: // CHECK-NEXT: ret ptr @fmv_inline._MfrinttsMrcpc // CHECK: resolver_else18: // CHECK-NEXT: [[TMP40:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP41:%.*]] = and i64 [[TMP40]], 8650752 -// CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[TMP41]], 8650752 +// CHECK-NEXT: [[TMP41:%.*]] = and i64 [[TMP40]], 1081344 +// CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[TMP41]], 1081344 // CHECK-NEXT: [[TMP43:%.*]] = and i1 true, [[TMP42]] // CHECK-NEXT: br i1 [[TMP43]], label [[RESOLVER_RETURN19:%.*]], label [[RESOLVER_ELSE20:%.*]] // CHECK: resolver_return19: // CHECK-NEXT: ret ptr @fmv_inline._MdpbMrcpc2 // CHECK: resolver_else20: // CHECK-NEXT: [[TMP44:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP45:%.*]] = and i64 [[TMP44]], 1572864 -// CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[TMP45]], 1572864 +// CHECK-NEXT: [[TMP45:%.*]] = and i64 [[TMP44]], 196608 +// CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[TMP45]], 196608 // CHECK-NEXT: [[TMP47:%.*]] = and i1 true, [[TMP46]] // CHECK-NEXT: br i1 [[TMP47]], label [[RESOLVER_RETURN21:%.*]], label [[RESOLVER_ELSE22:%.*]] // CHECK: resolver_return21: @@ -581,8 +581,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: ret ptr @fmv_inline._Mfp16fmlMsimd // CHECK: resolver_else24: // CHECK-NEXT: [[TMP52:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP53:%.*]] = and i64 [[TMP52]], 16400 -// CHECK-NEXT: [[TMP54:%.*]] = icmp eq i64 [[TMP53]], 16400 +// 
CHECK-NEXT: [[TMP53:%.*]] = and i64 [[TMP52]], 8208 +// CHECK-NEXT: [[TMP54:%.*]] = icmp eq i64 [[TMP53]], 8208 // CHECK-NEXT: [[TMP55:%.*]] = and i1 true, [[TMP54]] // CHECK-NEXT: br i1 [[TMP55]], label [[RESOLVER_RETURN25:%.*]], label [[RESOLVER_ELSE26:%.*]] // CHECK: resolver_return25: @@ -611,8 +611,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 2251799813685248 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2251799813685248 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 274877906944 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 274877906944 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -625,8 +625,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70368744177664 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70368744177664 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 34359738368 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 34359738368 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -639,8 +639,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 137438953472 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 137438953472 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -651,7 +651,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@recur -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: call void @reca() // CHECK-NEXT: ret void @@ -659,7 +659,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@main -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -670,7 +670,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@hoo -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[FP1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[FP2:%.*]] = alloca ptr, align 8 @@ -687,14 +687,14 @@ int caller(void) { return 
used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_forward_default_decl._Mmops -// CHECK-SAME: () #[[ATTR14:[0-9]+]] { +// CHECK-SAME: () #[[ATTR7]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_extern_forward_default_decl._Mdotprod -// CHECK-SAME: () #[[ATTR15:[0-9]+]] { +// CHECK-SAME: () #[[ATTR3]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // @@ -708,14 +708,14 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_default_def._Msve -// CHECK-SAME: () #[[ATTR16:[0-9]+]] { +// CHECK-SAME: () #[[ATTR15:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_default_def.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // @@ -729,56 +729,56 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def._Mlse -// CHECK-SAME: () #[[ATTR17:[0-9]+]] { +// CHECK-SAME: () #[[ATTR16:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_without_default._Mrdm -// CHECK-SAME: () #[[ATTR18:[0-9]+]] { +// CHECK-SAME: () #[[ATTR17:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mjscvt -// CHECK-SAME: () #[[ATTR21:[0-9]+]] { +// CHECK-SAME: () #[[ATTR20:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mrdm -// CHECK-SAME: () #[[ATTR18]] { +// CHECK-SAME: () #[[ATTR17]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@caller -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 @used_def_without_default_decl() // CHECK-NEXT: [[CALL1:%.*]] = call i32 @used_decl_without_default_decl() @@ -790,8 +790,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: 
[[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -812,8 +812,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -831,92 +831,92 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf64mmMpmullMsha1 -// CHECK-SAME: () #[[ATTR22:[0-9]+]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._McryptoMf64mm +// CHECK-SAME: () #[[ATTR21:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfcmaMfp16MrdmMsme -// CHECK-SAME: () #[[ATTR23:[0-9]+]] { +// CHECK-SAME: () #[[ATTR22:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf32mmMi8mmMsha3 -// CHECK-SAME: () #[[ATTR24:[0-9]+]] { +// CHECK-SAME: () #[[ATTR23:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 12 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MditMsve-ebf16 -// CHECK-SAME: () #[[ATTR25:[0-9]+]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msme2Mssbs +// CHECK-SAME: () #[[ATTR8]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 8 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MdpbMrcpc2 -// CHECK-SAME: () #[[ATTR26:[0-9]+]] { +// CHECK-SAME: () #[[ATTR24:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 6 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mdpb2Mjscvt -// CHECK-SAME: () #[[ATTR27:[0-9]+]] { +// CHECK-SAME: () #[[ATTR25:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 7 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfrinttsMrcpc -// CHECK-SAME: () #[[ATTR28:[0-9]+]] { +// CHECK-SAME: () #[[ATTR26:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MsveMsve-bf16 -// CHECK-SAME: () #[[ATTR29:[0-9]+]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MsmeMsve +// CHECK-SAME: () #[[ATTR27:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2-aesMsve2-sha3 -// CHECK-SAME: () #[[ATTR30:[0-9]+]] { +// CHECK-SAME: () 
#[[ATTR28:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 5 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2Msve2-bitpermMsve2-pmull128 -// CHECK-SAME: () #[[ATTR31:[0-9]+]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2Msve2-aesMsve2-bitperm +// CHECK-SAME: () #[[ATTR29:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 9 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag2Msve2-sm4 -// CHECK-SAME: () #[[ATTR32:[0-9]+]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MmemtagMsve2-sm4 +// CHECK-SAME: () #[[ATTR30:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 10 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag3MmopsMrcpc3 -// CHECK-SAME: () #[[ATTR33:[0-9]+]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MmemtagMmopsMrcpc3 +// CHECK-SAME: () #[[ATTR31:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 11 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MaesMdotprod -// CHECK-SAME: () #[[ATTR15]] { +// CHECK-SAME: () #[[ATTR3]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 13 // @@ -930,21 +930,21 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfpMsm4 -// CHECK-SAME: () #[[ATTR34:[0-9]+]] { +// CHECK-SAME: () #[[ATTR32:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 15 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MlseMrdm -// CHECK-SAME: () #[[ATTR35:[0-9]+]] { +// CHECK-SAME: () #[[ATTR33:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 16 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // @@ -953,8 +953,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 33554432 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 33554432 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -967,8 +967,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 65536 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 65536 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 16384 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 16384 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -995,8 +995,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // 
CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -1132,39 +1132,37 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp16fml,+fullfp16,+neon,+rand,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+altnzcv,+bf16,+flagm,+sme,+sme-i16i64,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,+neon,+sha2,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+ls64,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+neon,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp16fml,+fullfp16,+neon,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR8]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme2,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR9:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+sb,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+neon,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR20:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fullfp16,+neon,+rdm,+sme,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+rcpc,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+jsconv,+neon,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR28]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+rcpc,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR33]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,+mte,+rcpc,+rcpc3,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+sm4,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR35]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,+neon,+rdm,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sb,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR18:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR20]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fullfp16,+neon,+sha2,+sve,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fullfp16,+neon,+rdm,+sme,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+rcpc,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+jsconv,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+rcpc,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fullfp16,+neon,+sme,+sve,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR28]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-fp-armv8,-v9.5a" } +// CHECK: attributes 
#[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-sm4,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,+rcpc,+rcpc3,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+sm4,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR33]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,+neon,+rdm,-fp-armv8,-v9.5a" } //. // CHECK-NOFMV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } // CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } diff --git a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp index 7953f902bf09b2..d439f96982eb41 100644 --- a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp +++ b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs // RUN: %clang_cc1 -std=c++11 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck %s -int __attribute__((target_clones("ls64_v+fp16", "default"))) foo_ovl(int) { return 1; } -int __attribute__((target_clones("ls64_accdata+ls64"))) foo_ovl(void) { return 2; } +int __attribute__((target_clones("ls64+fp16", "default"))) foo_ovl(int) { return 1; } +int __attribute__((target_clones("sme+ls64"))) foo_ovl(void) { return 2; } int bar() { return foo_ovl(1) + foo_ovl(); @@ -35,9 +35,6 @@ void run_foo_tml() { Mc4.foo_tml(); } - - - //. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } // CHECK: @_Z7foo_ovli.ifunc = weak_odr alias i32 (i32), ptr @_Z7foo_ovli @@ -49,7 +46,7 @@ void run_foo_tml() { // CHECK: @_ZN7MyClassIssE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIssE7foo_tmlEv.resolver // CHECK: @_ZN7MyClassIisE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIisE7foo_tmlEv.resolver //. 
-// CHECK-LABEL: @_Z7foo_ovli._Mfp16Mls64_v( +// CHECK-LABEL: @_Z7foo_ovli._Mfp16Mls64( // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 @@ -60,17 +57,17 @@ void run_foo_tml() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 4503599627436032 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4503599627436032 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 274877923328 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 274877923328 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z7foo_ovli._Mfp16Mls64_v +// CHECK-NEXT: ret ptr @_Z7foo_ovli._Mfp16Mls64 // CHECK: resolver_else: // CHECK-NEXT: ret ptr @_Z7foo_ovli.default // // -// CHECK-LABEL: @_Z7foo_ovlv._Mls64Mls64_accdata( +// CHECK-LABEL: @_Z7foo_ovlv._Mls64Msme( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // @@ -79,12 +76,12 @@ void run_foo_tml() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 11258999068426240 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 11258999068426240 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 283467841536 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 283467841536 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z7foo_ovlv._Mls64Mls64_accdata +// CHECK-NEXT: ret ptr @_Z7foo_ovlv._Mls64Msme // CHECK: resolver_else: // CHECK-NEXT: ret ptr @_Z7foo_ovlv.default // @@ -114,16 +111,16 @@ void run_foo_tml() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36310271995674624 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1236950581248 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1236950581248 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @_ZN7MyClassIssE7foo_tmlEv._Msme-f64f64Mssbs // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16777216 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 2097152 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 2097152 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -136,16 +133,16 @@ void run_foo_tml() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 
36310271995674624 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1236950581248 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1236950581248 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @_ZN7MyClassIisE7foo_tmlEv._Msme-f64f64Mssbs // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16777216 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 2097152 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 2097152 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -231,7 +228,7 @@ void run_foo_tml() { // //. // CHECK: attributes #[[ATTR0:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } -// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ls64" } +// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ATTR2:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CHECK: attributes #[[ATTR3:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint" } // CHECK: attributes #[[ATTR4:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" } diff --git a/clang/test/CodeGenCXX/attr-target-version.cpp b/clang/test/CodeGenCXX/attr-target-version.cpp index 8b7273fe3bb517..24433b8f20b7d1 100644 --- a/clang/test/CodeGenCXX/attr-target-version.cpp +++ b/clang/test/CodeGenCXX/attr-target-version.cpp @@ -3,7 +3,7 @@ int __attribute__((target_version("sme-f64f64+bf16"))) foo(int) { return 1; } int __attribute__((target_version("default"))) foo(int) { return 2; } -int __attribute__((target_version("sm4+ebf16"))) foo(void) { return 3; } +int __attribute__((target_version("sm4+bf16"))) foo(void) { return 3; } int __attribute__((target_version("default"))) foo(void) { return 4; } struct MyClass { @@ -89,7 +89,7 @@ int bar() { // CHECK-NEXT: ret i32 2 // // -// CHECK-LABEL: @_Z3foov._Mebf16Msm4( +// CHECK-LABEL: @_Z3foov._Mbf16Msm4( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // @@ -246,8 +246,8 @@ int bar() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1099520016384 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1099520016384 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -260,12 +260,12 @@ int bar() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void 
@__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 268435488 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 8388640 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 8388640 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z3foov._Mebf16Msm4 +// CHECK-NEXT: ret ptr @_Z3foov._Mbf16Msm4 // CHECK: resolver_else: // CHECK-NEXT: ret ptr @_Z3foov.default // @@ -274,8 +274,8 @@ int bar() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 33554432 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 33554432 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -288,8 +288,8 @@ int bar() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 65536 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 65536 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 16384 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 16384 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: diff --git a/clang/test/Sema/aarch64-cpu-supports.c b/clang/test/Sema/aarch64-cpu-supports.c index ddeed7c5bc9e97..cf8e043dfb1e3d 100644 --- a/clang/test/Sema/aarch64-cpu-supports.c +++ b/clang/test/Sema/aarch64-cpu-supports.c @@ -20,7 +20,7 @@ int test_aarch64_features(void) { // expected-warning@+1 {{invalid cpu feature string}} if (__builtin_cpu_supports("default")) return 6; - if (__builtin_cpu_supports(" ssbs + bti ")) + if (__builtin_cpu_supports(" ssbs + crc ")) return 7; return 0; } diff --git a/clang/test/Sema/attr-target-clones-aarch64.c b/clang/test/Sema/attr-target-clones-aarch64.c index bc3fceab82825b..511e8a9411804d 100644 --- a/clang/test/Sema/attr-target-clones-aarch64.c +++ b/clang/test/Sema/attr-target-clones-aarch64.c @@ -9,7 +9,7 @@ void __attribute__((target_clones("ssbs+ls64"))) warn2(void); // expected-error@+2 {{'target_clones' and 'target_version' attributes are not compatible}} // expected-note@+1 {{conflicting attribute is here}} -void __attribute__((target_version("sve-bf16"), target_clones("sme+memtag"))) not_compat(void); +void __attribute__((target_version("bf16"), target_clones("sme+memtag"))) not_compat(void); int redecl(void); int __attribute__((target_clones("frintts", "simd+fp", "default"))) redecl(void) { return 1; } @@ -21,26 +21,25 @@ int __attribute__((target_clones("sve+dotprod"))) redecl3(void); int redecl3(void); int __attribute__((target_clones("rng", "fp16fml+fp", "default"))) redecl4(void); -// expected-error@+3 {{'target_clones' attribute does not match previous declaration}} +// expected-error@+2 {{'target_clones' attribute does not match previous declaration}} // expected-note@-2 {{previous 
declaration is here}} -// expected-warning@+1 {{version list contains entries that don't impact code generation}} -int __attribute__((target_clones("dgh+memtag+rpres+ls64_v", "ebf16+dpb+sha1", "default"))) redecl4(void) { return 1; } +int __attribute__((target_clones("dpb2+memtag+rpres+ls64", "bf16+dpb+sha2", "default"))) redecl4(void) { return 1; } int __attribute__((target_version("flagm2"))) redef2(void) { return 1; } // expected-error@+2 {{multiversioned function redeclarations require identical target attributes}} // expected-note@-2 {{previous declaration is here}} int __attribute__((target_clones("flagm2", "default"))) redef2(void) { return 1; } -int __attribute__((target_clones("f32mm", "f64mm", "sha1+fp"))) redef3(void) { return 1; } +int __attribute__((target_clones("f32mm", "f64mm", "sha3+fp"))) redef3(void) { return 1; } // expected-error@+2 {{'target_clones' attribute does not match previous declaration}} // expected-note@-2 {{previous declaration is here}} -int __attribute__((target_clones("f32mm", "sha1+fp", "f64mm"))) redef3(void) { return 1; } +int __attribute__((target_clones("f32mm", "sha3+fp", "f64mm"))) redef3(void) { return 1; } int __attribute__((target_clones("rdm+lse+rdm", "lse+rdm"))) dup1(void) { return 1; } // expected-warning@+1 {{version list contains duplicate entries}} int __attribute__((target_clones("rdm+lse+rdm", "rdm+lse+rdm"))) dup2(void) { return 2; } // expected-warning@+1 {{version list contains duplicate entries}} -int __attribute__((target_clones("rcpc2+sve2-pmull128", "rcpc2+sve2-pmull128"))) dup3(void) { return 3; } +int __attribute__((target_clones("rcpc2+sve2-aes", "rcpc2+sve2-aes"))) dup3(void) { return 3; } // expected-warning@+1 {{version list contains duplicate entries}} void __attribute__((target_clones("sha3", "default", "default"))) dup4(void); // expected-warning@+2 {{version list contains duplicate entries}} @@ -49,7 +48,7 @@ int __attribute__((target_clones("fp", "fp", "crc+dotprod", "dotprod+crc"))) dup // expected-warning@+1 {{version list contains duplicate entries}} int __attribute__((target_clones("fp16+memtag", "memtag+fp16"))) dup6(void) { return 6; } -int __attribute__((target_clones("simd+ssbs2", "simd+dpb2"))) dup7(void) { return 7; } +int __attribute__((target_clones("simd+ssbs", "simd+dpb2"))) dup7(void) { return 7; } // expected-warning@+1 {{unsupported '' in the 'target_clones' attribute string;}} void __attribute__((target_clones(""))) empty_target_1(void); @@ -72,13 +71,13 @@ empty_target_5(void); void __attribute__((target_clones("sve2-bitperm", "sve2-bitperm"))) dupe_normal(void); -void __attribute__((target_clones("default"), target_clones("memtag3+bti"))) dupe_normal2(void); +void __attribute__((target_clones("default"), target_clones("memtag+mops"))) dupe_normal2(void); int mv_after_use(void); int useage(void) { return mv_after_use(); } // expected-error@+1 {{function declaration cannot become a multiversioned function after first usage}} -int __attribute__((target_clones("sve2-sha3+ssbs2", "sm4"))) mv_after_use(void) { return 1; } +int __attribute__((target_clones("sve2-sha3+ssbs", "sm4"))) mv_after_use(void) { return 1; } // expected-error@+1 {{'main' cannot be a multiversioned function}} -int __attribute__((target_clones("sve-i8mm"))) main() { return 1; } +int __attribute__((target_clones("sve2-aes"))) main() { return 1; } diff --git a/clang/test/Sema/attr-target-version.c b/clang/test/Sema/attr-target-version.c index cd5be459456eb7..4ff8e1eee619e8 100644 --- a/clang/test/Sema/attr-target-version.c +++ 
b/clang/test/Sema/attr-target-version.c @@ -16,7 +16,7 @@ int __attribute__((target_version("aes"))) foo(void) { return 1; } int __attribute__((target_version("default"))) foo(void) { return 2; } //expected-note@+1 {{previous definition is here}} -int __attribute__((target_version("sha3 + pmull "))) foo(void) { return 1; } +int __attribute__((target_version("sha3 + aes "))) foo(void) { return 1; } //expected-note@-1 {{previous definition is here}} //expected-error@+1 {{redefinition of 'foo'}} @@ -32,11 +32,11 @@ __attribute__ ((target("bf16,sve,sve2,dotprod"))) int func(void) { return 1; } __attribute__ ((target("default"))) int func(void) { return 0; } //expected-note@+1 {{previous declaration is here}} -void __attribute__((target_version("bti+flagm2"))) one(void) {} +void __attribute__((target_version("mops+flagm2"))) one(void) {} //expected-error@+1 {{multiversioned function redeclarations require identical target attributes}} -void __attribute__((target_version("flagm2+bti"))) one(void) {} +void __attribute__((target_version("flagm2+mops"))) one(void) {} -void __attribute__((target_version("ssbs+sha1"))) two(void) {} +void __attribute__((target_version("ssbs+sha2"))) two(void) {} void __attribute__((target_version("ssbs+fp16fml"))) two(void) {} //expected-error@+1 {{'main' cannot be a multiversioned function}} @@ -44,7 +44,7 @@ int __attribute__((target_version("lse"))) main(void) { return 1; } // It is ok for the default version to appear first. int default_first(void) { return 1; } -int __attribute__((target_version("dit"))) default_first(void) { return 2; } +int __attribute__((target_version("crc"))) default_first(void) { return 2; } int __attribute__((target_version("mops"))) default_first(void) { return 3; } // It is ok if the default version is between other versions. 
@@ -77,7 +77,7 @@ void __attribute__((target_version("rdm+rng+crc"))) redef(void) {} void __attribute__((target_version("rdm+rng+crc"))) redef(void) {} int def(void); -void __attribute__((target_version("dit"))) nodef(void); +void __attribute__((target_version("crc"))) nodef(void); void __attribute__((target_version("ls64"))) nodef(void); void __attribute__((target_version("aes"))) ovl(void); void __attribute__((target_version("default"))) ovl(void); @@ -89,12 +89,12 @@ int bar() { return def(); } // expected-error@+1 {{function declaration cannot become a multiversioned function after first usage}} -int __attribute__((target_version("sha1"))) def(void) { return 1; } +int __attribute__((target_version("sha3"))) def(void) { return 1; } int __attribute__((target_version("sve"))) prot(); // expected-error@-1 {{multiversioned function must have a prototype}} -int __attribute__((target_version("pmull"))) rtype(int); +int __attribute__((target_version("aes"))) rtype(int); // expected-error@+1 {{multiversioned function declaration has a different return type}} float __attribute__((target_version("rdm"))) rtype(int); @@ -102,7 +102,7 @@ int __attribute__((target_version("sha2"))) combine(void) { return 1; } // expected-error@+1 {{multiversioned function declaration has a different calling convention}} int __attribute__((aarch64_vector_pcs, target_version("sha3"))) combine(void) { return 2; } -int __attribute__((target_version("fp+aes+pmull+rcpc"))) unspec_args() { return -1; } +int __attribute__((target_version("fp+aes+sha3+rcpc"))) unspec_args() { return -1; } // expected-error@-1 {{multiversioned function must have a prototype}} // expected-error@+1 {{multiversioned function must have a prototype}} int __attribute__((target_version("default"))) unspec_args() { return 0; } diff --git a/clang/test/SemaCXX/attr-target-version.cpp b/clang/test/SemaCXX/attr-target-version.cpp index b3385f043590f8..ae2923cc303c2e 100644 --- a/clang/test/SemaCXX/attr-target-version.cpp +++ b/clang/test/SemaCXX/attr-target-version.cpp @@ -25,13 +25,13 @@ int __attribute__((target_version("dpb"))) diff_link(void); int __attribute__((target_version("memtag"))) diff_link1(void) { return 1; } //expected-error@+1 {{multiversioned function declaration has a different linkage}} -static int __attribute__((target_version("bti"))) diff_link1(void); +static int __attribute__((target_version("mops"))) diff_link1(void); int __attribute__((target_version("flagm2"))) diff_link2(void) { return 1; } extern int __attribute__((target_version("flagm"))) diff_link2(void); namespace { -static int __attribute__((target_version("memtag3"))) diff_link2(void) { return 2; } +static int __attribute__((target_version("memtag"))) diff_link2(void) { return 2; } int __attribute__((target_version("sve2-bitperm"))) diff_link2(void) { return 1; } } // namespace @@ -49,7 +49,7 @@ double __attribute__((target_version("rcpc"))) diff_type1(void); auto __attribute__((target_version("rcpc2"))) diff_type2(void) -> int { return 1; } //expected-error@+1 {{multiversioned function declaration has a different return type}} -auto __attribute__((target_version("sve-bf16"))) diff_type2(void) -> long { return (long)1; } +auto __attribute__((target_version("sve2-aes"))) diff_type2(void) -> long { return (long)1; } int __attribute__((target_version("fp16fml"))) diff_type3(void) noexcept(false) { return 1; } //expected-error@+2 {{exception specification in declaration does not match previous declaration}} @@ -75,7 +75,7 @@ auto __attribute__((target_version("dpb2"))) 
ret3(void) -> int { return 1; } class Cls { __attribute__((target_version("rng"))) Cls(); // expected-error@-1 {{attribute 'target_version' multiversioned functions do not yet support constructors}} - __attribute__((target_version("sve-i8mm"))) ~Cls(); + __attribute__((target_version("sve2-aes"))) ~Cls(); // expected-error@-1 {{attribute 'target_version' multiversioned functions do not yet support destructors}} Cls &__attribute__((target_version("f32mm"))) operator=(const Cls &) = default; @@ -98,11 +98,11 @@ __attribute__((target_version("jscvt"))) void Decl(); } // namespace Nms class Out { - int __attribute__((target_version("bti"))) func(void); - int __attribute__((target_version("ssbs2"))) func(void); + int __attribute__((target_version("mops"))) func(void); + int __attribute__((target_version("ssbs"))) func(void); }; -int __attribute__((target_version("bti"))) Out::func(void) { return 1; } -int __attribute__((target_version("ssbs2"))) Out::func(void) { return 2; } +int __attribute__((target_version("mops"))) Out::func(void) { return 1; } +int __attribute__((target_version("ssbs"))) Out::func(void) { return 2; } // expected-error@+3 {{out-of-line definition of 'func' does not match any declaration in 'Out'}} // expected-note@-3 {{member declaration nearly matches}} // expected-note@-3 {{member declaration nearly matches}} diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c index 17bddfca46f094..1ac4d85a0c139c 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64.c +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c @@ -67,13 +67,10 @@ enum CPUFeatures { FEAT_FP, FEAT_SIMD, FEAT_CRC, - FEAT_SHA1, FEAT_SHA2, FEAT_SHA3, FEAT_AES, - FEAT_PMULL, FEAT_FP16, - FEAT_DIT, FEAT_DPB, FEAT_DPB2, FEAT_JSCVT, @@ -81,35 +78,23 @@ enum CPUFeatures { FEAT_RCPC, FEAT_RCPC2, FEAT_FRINTTS, - FEAT_DGH, FEAT_I8MM, FEAT_BF16, - FEAT_EBF16, FEAT_RPRES, FEAT_SVE, - FEAT_SVE_BF16, - FEAT_SVE_EBF16, - FEAT_SVE_I8MM, FEAT_SVE_F32MM, FEAT_SVE_F64MM, FEAT_SVE2, FEAT_SVE_AES, - FEAT_SVE_PMULL128, FEAT_SVE_BITPERM, FEAT_SVE_SHA3, FEAT_SVE_SM4, FEAT_SME, FEAT_MEMTAG, - FEAT_MEMTAG2, - FEAT_MEMTAG3, FEAT_SB, FEAT_PREDRES, FEAT_SSBS, - FEAT_SSBS2, - FEAT_BTI, FEAT_LS64, - FEAT_LS64_V, - FEAT_LS64_ACCDATA, FEAT_WFXT, FEAT_SME_F64, FEAT_SME_I64, @@ -117,7 +102,7 @@ enum CPUFeatures { FEAT_RCPC3, FEAT_MOPS, FEAT_MAX, - FEAT_EXT = 62, // Reserved to indicate presence of additional features field + FEAT_EXT = 47, // Reserved to indicate presence of additional features field // in __aarch64_cpu_features FEAT_INIT // Used as flag of features initialization completion }; diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc index 6fef109567b613..19114c4abdfca2 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc @@ -35,13 +35,10 @@ void __init_cpu_features_resolver(void) { {"hw.optional.floatingpoint", FEAT_FP}, {"hw.optional.AdvSIMD", FEAT_SIMD}, {"hw.optional.armv8_crc32", FEAT_CRC}, - {"hw.optional.arm.FEAT_SHA1", FEAT_SHA1}, {"hw.optional.arm.FEAT_SHA256", FEAT_SHA2}, {"hw.optional.arm.FEAT_SHA3", FEAT_SHA3}, {"hw.optional.arm.FEAT_AES", FEAT_AES}, - {"hw.optional.arm.FEAT_PMULL", FEAT_PMULL}, {"hw.optional.arm.FEAT_FP16", FEAT_FP16}, - {"hw.optional.arm.FEAT_DIT", FEAT_DIT}, {"hw.optional.arm.FEAT_DPB", FEAT_DPB}, {"hw.optional.arm.FEAT_DPB2", FEAT_DPB2}, {"hw.optional.arm.FEAT_JSCVT", FEAT_JSCVT}, @@ -53,8 +50,7 @@ 
void __init_cpu_features_resolver(void) { {"hw.optional.arm.FEAT_BF16", FEAT_BF16}, {"hw.optional.arm.FEAT_SB", FEAT_SB}, {"hw.optional.arm.FEAT_SPECRES", FEAT_PREDRES}, - {"hw.optional.arm.FEAT_SSBS", FEAT_SSBS2}, - {"hw.optional.arm.FEAT_BTI", FEAT_BTI}, + {"hw.optional.arm.FEAT_SSBS", FEAT_SSBS}, }; for (size_t I = 0, E = sizeof(feature_checks) / sizeof(feature_checks[0]); diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc index d8e0280f404167..579aa00dc5c72b 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc @@ -22,10 +22,6 @@ void __init_cpu_features_resolver() { setCPUFeature(FEAT_SIMD); if (features & ZX_ARM64_FEATURE_ISA_AES) setCPUFeature(FEAT_AES); - if (features & ZX_ARM64_FEATURE_ISA_PMULL) - setCPUFeature(FEAT_PMULL); - if (features & ZX_ARM64_FEATURE_ISA_SHA1) - setCPUFeature(FEAT_SHA1); if (features & ZX_ARM64_FEATURE_ISA_SHA256) setCPUFeature(FEAT_SHA2); if (features & ZX_ARM64_FEATURE_ISA_CRC32) diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc index 32a21a2fba9a31..54797852e4aaa3 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc @@ -16,8 +16,6 @@ static void __init_cpu_features_constructor(unsigned long hwcap, hwcap2 = arg->_hwcap2; if (hwcap & HWCAP_CRC32) setCPUFeature(FEAT_CRC); - if (hwcap & HWCAP_PMULL) - setCPUFeature(FEAT_PMULL); if (hwcap & HWCAP_FLAGM) setCPUFeature(FEAT_FLAGM); if (hwcap2 & HWCAP2_FLAGM2) { @@ -34,16 +32,12 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_FP16); setCPUFeature(FEAT_FP); } - if (hwcap & HWCAP_DIT) - setCPUFeature(FEAT_DIT); if (hwcap & HWCAP_ASIMDRDM) setCPUFeature(FEAT_RDM); if (hwcap & HWCAP_ILRCPC) setCPUFeature(FEAT_RCPC2); if (hwcap & HWCAP_AES) setCPUFeature(FEAT_AES); - if (hwcap & HWCAP_SHA1) - setCPUFeature(FEAT_SHA1); if (hwcap & HWCAP_SHA2) setCPUFeature(FEAT_SHA2); if (hwcap & HWCAP_JSCVT) @@ -53,22 +47,11 @@ static void __init_cpu_features_constructor(unsigned long hwcap, if (hwcap & HWCAP_SB) setCPUFeature(FEAT_SB); if (hwcap & HWCAP_SSBS) - setCPUFeature(FEAT_SSBS2); - if (hwcap2 & HWCAP2_MTE) { + setCPUFeature(FEAT_SSBS); + if (hwcap2 & HWCAP2_MTE) setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); - } - if (hwcap2 & HWCAP2_MTE3) { - setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); - setCPUFeature(FEAT_MEMTAG3); - } if (hwcap2 & HWCAP2_SVEAES) setCPUFeature(FEAT_SVE_AES); - if (hwcap2 & HWCAP2_SVEPMULL) { - setCPUFeature(FEAT_SVE_AES); - setCPUFeature(FEAT_SVE_PMULL128); - } if (hwcap2 & HWCAP2_SVEBITPERM) setCPUFeature(FEAT_SVE_BITPERM); if (hwcap2 & HWCAP2_SVESHA3) @@ -83,22 +66,12 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_RNG); if (hwcap2 & HWCAP2_I8MM) setCPUFeature(FEAT_I8MM); - if (hwcap2 & HWCAP2_EBF16) - setCPUFeature(FEAT_EBF16); - if (hwcap2 & HWCAP2_SVE_EBF16) - setCPUFeature(FEAT_SVE_EBF16); - if (hwcap2 & HWCAP2_DGH) - setCPUFeature(FEAT_DGH); if (hwcap2 & HWCAP2_FRINT) setCPUFeature(FEAT_FRINTTS); - if (hwcap2 & HWCAP2_SVEI8MM) - setCPUFeature(FEAT_SVE_I8MM); if (hwcap2 & HWCAP2_SVEF32MM) setCPUFeature(FEAT_SVE_F32MM); if (hwcap2 & HWCAP2_SVEF64MM) setCPUFeature(FEAT_SVE_F64MM); - if (hwcap2 & HWCAP2_BTI) - setCPUFeature(FEAT_BTI); if (hwcap2 & HWCAP2_RPRES) setCPUFeature(FEAT_RPRES); 
if (hwcap2 & HWCAP2_WFXT) @@ -141,9 +114,6 @@ static void __init_cpu_features_constructor(unsigned long hwcap, // ID_AA64ZFR0_EL1.SVEver == 0b0001 if (extractBits(ftr, 0, 4) == 0x1) setCPUFeature(FEAT_SVE2); - // ID_AA64ZFR0_EL1.BF16 != 0b0000 - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_SVE_BF16); } getCPUFeature(ID_AA64ISAR0_EL1, ftr); // ID_AA64ISAR0_EL1.SHA3 != 0b0000 @@ -168,12 +138,6 @@ static void __init_cpu_features_constructor(unsigned long hwcap, // ID_AA64ISAR1_EL1.LS64 >= 0b0001 if (extractBits(ftr, 60, 4) >= 0x1) setCPUFeature(FEAT_LS64); - // ID_AA64ISAR1_EL1.LS64 >= 0b0010 - if (extractBits(ftr, 60, 4) >= 0x2) - setCPUFeature(FEAT_LS64_V); - // ID_AA64ISAR1_EL1.LS64 >= 0b0011 - if (extractBits(ftr, 60, 4) >= 0x3) - setCPUFeature(FEAT_LS64_ACCDATA); } else { // Set some features in case of no CPUID support if (hwcap & (HWCAP_FP | HWCAP_FPHP)) { @@ -187,8 +151,6 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_RCPC); if (hwcap2 & HWCAP2_BF16 || hwcap2 & HWCAP2_EBF16) setCPUFeature(FEAT_BF16); - if (hwcap2 & HWCAP2_SVEBF16) - setCPUFeature(FEAT_SVE_BF16); if (hwcap2 & HWCAP2_SVE2 && hwcap & HWCAP_SVE) setCPUFeature(FEAT_SVE2); if (hwcap & HWCAP_SHA3) diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 805b963a7a13c7..bcb36beb3cc0b9 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -46,13 +46,10 @@ enum CPUFeatures { FEAT_FP, FEAT_SIMD, FEAT_CRC, - FEAT_SHA1, FEAT_SHA2, FEAT_SHA3, FEAT_AES, - FEAT_PMULL, FEAT_FP16, - FEAT_DIT, FEAT_DPB, FEAT_DPB2, FEAT_JSCVT, @@ -60,35 +57,23 @@ enum CPUFeatures { FEAT_RCPC, FEAT_RCPC2, FEAT_FRINTTS, - FEAT_DGH, FEAT_I8MM, FEAT_BF16, - FEAT_EBF16, FEAT_RPRES, FEAT_SVE, - FEAT_SVE_BF16, - FEAT_SVE_EBF16, - FEAT_SVE_I8MM, FEAT_SVE_F32MM, FEAT_SVE_F64MM, FEAT_SVE2, FEAT_SVE_AES, - FEAT_SVE_PMULL128, FEAT_SVE_BITPERM, FEAT_SVE_SHA3, FEAT_SVE_SM4, FEAT_SME, FEAT_MEMTAG, - FEAT_MEMTAG2, - FEAT_MEMTAG3, FEAT_SB, FEAT_PREDRES, FEAT_SSBS, - FEAT_SSBS2, - FEAT_BTI, FEAT_LS64, - FEAT_LS64_V, - FEAT_LS64_ACCDATA, FEAT_WFXT, FEAT_SME_F64, FEAT_SME_I64, @@ -96,12 +81,12 @@ enum CPUFeatures { FEAT_RCPC3, FEAT_MOPS, FEAT_MAX, - FEAT_EXT = 62, + FEAT_EXT = 47, FEAT_INIT }; -static_assert(FEAT_MAX < 62, - "Number of features in CPUFeatures are limited to 62 entries"); +static_assert(FEAT_MAX < 47, + "Number of features in CPUFeatures are limited to 47 entries"); // Arch extension modifiers for CPUs. 
These are labelled with their Arm ARM // feature name (though the canonical reference for those is AArch64.td) @@ -215,17 +200,13 @@ inline constexpr ExtensionInfo Extensions[] = { {"b16b16", AArch64::AEK_B16B16, "+b16b16", "-b16b16", FEAT_INIT, "", 0}, {"bf16", AArch64::AEK_BF16, "+bf16", "-bf16", FEAT_BF16, "+bf16", 280}, {"brbe", AArch64::AEK_BRBE, "+brbe", "-brbe", FEAT_INIT, "", 0}, - {"bti", AArch64::AEK_NONE, {}, {}, FEAT_BTI, "+bti", 510}, {"crc", AArch64::AEK_CRC, "+crc", "-crc", FEAT_CRC, "+crc", 110}, {"crypto", AArch64::AEK_CRYPTO, "+crypto", "-crypto", FEAT_INIT, "+aes,+sha2", 0}, {"cssc", AArch64::AEK_CSSC, "+cssc", "-cssc", FEAT_INIT, "", 0}, {"d128", AArch64::AEK_D128, "+d128", "-d128", FEAT_INIT, "", 0}, - {"dgh", AArch64::AEK_NONE, {}, {}, FEAT_DGH, "", 260}, - {"dit", AArch64::AEK_NONE, {}, {}, FEAT_DIT, "+dit", 180}, {"dotprod", AArch64::AEK_DOTPROD, "+dotprod", "-dotprod", FEAT_DOTPROD, "+dotprod,+fp-armv8,+neon", 104}, {"dpb", AArch64::AEK_NONE, {}, {}, FEAT_DPB, "+ccpp", 190}, {"dpb2", AArch64::AEK_NONE, {}, {}, FEAT_DPB2, "+ccpp,+ccdp", 200}, - {"ebf16", AArch64::AEK_NONE, {}, {}, FEAT_EBF16, "+bf16", 290}, {"f32mm", AArch64::AEK_F32MM, "+f32mm", "-f32mm", FEAT_SVE_F32MM, "+sve,+f32mm,+fullfp16,+fp-armv8,+neon", 350}, {"f64mm", AArch64::AEK_F64MM, "+f64mm", "-f64mm", FEAT_SVE_F64MM, "+sve,+f64mm,+fullfp16,+fp-armv8,+neon", 360}, {"fcma", AArch64::AEK_FCMA, "+complxnum", "-complxnum", FEAT_FCMA, "+fp-armv8,+neon,+complxnum", 220}, @@ -239,17 +220,12 @@ inline constexpr ExtensionInfo Extensions[] = { {"i8mm", AArch64::AEK_I8MM, "+i8mm", "-i8mm", FEAT_I8MM, "+i8mm", 270}, {"ite", AArch64::AEK_ITE, "+ite", "-ite", FEAT_INIT, "", 0}, {"jscvt", AArch64::AEK_JSCVT, "+jsconv", "-jsconv", FEAT_JSCVT, "+fp-armv8,+neon,+jsconv", 210}, - {"ls64_accdata", AArch64::AEK_NONE, {}, {}, FEAT_LS64_ACCDATA, "+ls64", 540}, - {"ls64_v", AArch64::AEK_NONE, {}, {}, FEAT_LS64_V, "", 530}, {"ls64", AArch64::AEK_LS64, "+ls64", "-ls64", FEAT_LS64, "", 520}, {"lse", AArch64::AEK_LSE, "+lse", "-lse", FEAT_LSE, "+lse", 80}, {"lse128", AArch64::AEK_LSE128, "+lse128", "-lse128", FEAT_INIT, "", 0}, {"memtag", AArch64::AEK_MTE, "+mte", "-mte", FEAT_MEMTAG, "", 440}, - {"memtag2", AArch64::AEK_NONE, {}, {}, FEAT_MEMTAG2, "+mte", 450}, - {"memtag3", AArch64::AEK_NONE, {}, {}, FEAT_MEMTAG3, "+mte", 460}, {"mops", AArch64::AEK_MOPS, "+mops", "-mops", FEAT_MOPS, "+mops", 650}, {"pauth", AArch64::AEK_PAUTH, "+pauth", "-pauth", FEAT_INIT, "", 0}, - {"pmull", AArch64::AEK_NONE, {}, {}, FEAT_PMULL, "+aes,+fp-armv8,+neon", 160}, {"pmuv3", AArch64::AEK_PERFMON, "+perfmon", "-perfmon", FEAT_INIT, "", 0}, {"predres", AArch64::AEK_PREDRES, "+predres", "-predres", FEAT_PREDRES, "+predres", 480}, {"predres2", AArch64::AEK_SPECRES2, "+specres2", "-specres2", FEAT_INIT, "", 0}, @@ -263,7 +239,6 @@ inline constexpr ExtensionInfo Extensions[] = { {"rng", AArch64::AEK_RAND, "+rand", "-rand", FEAT_RNG, "+rand", 10}, {"rpres", AArch64::AEK_NONE, {}, {}, FEAT_RPRES, "", 300}, {"sb", AArch64::AEK_SB, "+sb", "-sb", FEAT_SB, "+sb", 470}, - {"sha1", AArch64::AEK_NONE, {}, {}, FEAT_SHA1, "+fp-armv8,+neon", 120}, {"sha2", AArch64::AEK_SHA2, "+sha2", "-sha2", FEAT_SHA2, "+sha2,+fp-armv8,+neon", 130}, {"sha3", AArch64::AEK_SHA3, "+sha3", "-sha3", FEAT_SHA3, "+sha3,+sha2,+fp-armv8,+neon", 140}, {"simd", AArch64::AEK_SIMD, "+neon", "-neon", FEAT_SIMD, "+fp-armv8,+neon", 100}, @@ -275,14 +250,9 @@ inline constexpr ExtensionInfo Extensions[] = { {"sme2", AArch64::AEK_SME2, "+sme2", "-sme2", FEAT_SME2, "+sme2,+sme,+bf16", 580}, 
{"sme2p1", AArch64::AEK_SME2p1, "+sme2p1", "-sme2p1", FEAT_INIT, "+sme2p1,+sme2,+sme,+bf16", 0}, {"ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs", FEAT_SSBS, "", 490}, - {"ssbs2", AArch64::AEK_NONE, {}, {}, FEAT_SSBS2, "+ssbs", 500}, - {"sve-bf16", AArch64::AEK_NONE, {}, {}, FEAT_SVE_BF16, "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 320}, - {"sve-ebf16", AArch64::AEK_NONE, {}, {}, FEAT_SVE_EBF16, "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 330}, - {"sve-i8mm", AArch64::AEK_NONE, {}, {}, FEAT_SVE_I8MM, "+sve,+i8mm,+fullfp16,+fp-armv8,+neon", 340}, {"sve", AArch64::AEK_SVE, "+sve", "-sve", FEAT_SVE, "+sve,+fullfp16,+fp-armv8,+neon", 310}, {"sve2-aes", AArch64::AEK_SVE2AES, "+sve2-aes", "-sve2-aes", FEAT_SVE_AES, "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 380}, {"sve2-bitperm", AArch64::AEK_SVE2BITPERM, "+sve2-bitperm", "-sve2-bitperm", FEAT_SVE_BITPERM, "+sve2,+sve,+sve2-bitperm,+fullfp16,+fp-armv8,+neon", 400}, - {"sve2-pmull128", AArch64::AEK_NONE, {}, {}, FEAT_SVE_PMULL128, "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 390}, {"sve2-sha3", AArch64::AEK_SVE2SHA3, "+sve2-sha3", "-sve2-sha3", FEAT_SVE_SHA3, "+sve2,+sve,+sve2-sha3,+fullfp16,+fp-armv8,+neon", 410}, {"sve2-sm4", AArch64::AEK_SVE2SM4, "+sve2-sm4", "-sve2-sm4", FEAT_SVE_SM4, "+sve2,+sve,+sve2-sm4,+fullfp16,+fp-armv8,+neon", 420}, {"sve2", AArch64::AEK_SVE2, "+sve2", "-sve2", FEAT_SVE2, "+sve2,+sve,+fullfp16,+fp-armv8,+neon", 370}, From 06947b9f8d258fe66fc69f1e7c0197cb621da3a5 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 17 Apr 2024 09:31:29 -0700 Subject: [PATCH 275/300] [libc][POSIX][pthreads] implement pthread_condattr_t functions (#88987) Implement: - pthread_condattr_destroy - pthread_condattr_getclock - pthread_condattr_getpshared - pthread_condattr_init - pthread_condattr_setclock - pthread_condattr_setpshared Fixes: #88581 --- libc/config/linux/api.td | 29 ++++++-- libc/config/linux/x86_64/entrypoints.txt | 6 ++ libc/include/CMakeLists.txt | 1 + libc/include/llvm-libc-types/CMakeLists.txt | 3 +- .../llvm-libc-types/pthread_condattr_t.h | 18 +++++ libc/include/pthread.h.def | 6 ++ libc/spec/posix.td | 64 ++++++++++++++++- libc/src/pthread/CMakeLists.txt | 65 +++++++++++++++++ libc/src/pthread/pthread_condattr_destroy.cpp | 23 ++++++ libc/src/pthread/pthread_condattr_destroy.h | 20 ++++++ .../src/pthread/pthread_condattr_getclock.cpp | 25 +++++++ libc/src/pthread/pthread_condattr_getclock.h | 22 ++++++ .../pthread/pthread_condattr_getpshared.cpp | 24 +++++++ .../src/pthread/pthread_condattr_getpshared.h | 21 ++++++ libc/src/pthread/pthread_condattr_init.cpp | 24 +++++++ libc/src/pthread/pthread_condattr_init.h | 20 ++++++ .../src/pthread/pthread_condattr_setclock.cpp | 30 ++++++++ libc/src/pthread/pthread_condattr_setclock.h | 21 ++++++ .../pthread/pthread_condattr_setpshared.cpp | 28 ++++++++ .../src/pthread/pthread_condattr_setpshared.h | 20 ++++++ libc/test/src/pthread/CMakeLists.txt | 12 ++++ .../src/pthread/pthread_condattr_test.cpp | 71 +++++++++++++++++++ 22 files changed, 545 insertions(+), 8 deletions(-) create mode 100644 libc/include/llvm-libc-types/pthread_condattr_t.h create mode 100644 libc/src/pthread/pthread_condattr_destroy.cpp create mode 100644 libc/src/pthread/pthread_condattr_destroy.h create mode 100644 libc/src/pthread/pthread_condattr_getclock.cpp create mode 100644 libc/src/pthread/pthread_condattr_getclock.h create mode 100644 libc/src/pthread/pthread_condattr_getpshared.cpp create mode 100644 libc/src/pthread/pthread_condattr_getpshared.h create mode 100644 
libc/src/pthread/pthread_condattr_init.cpp create mode 100644 libc/src/pthread/pthread_condattr_init.h create mode 100644 libc/src/pthread/pthread_condattr_setclock.cpp create mode 100644 libc/src/pthread/pthread_condattr_setclock.h create mode 100644 libc/src/pthread/pthread_condattr_setpshared.cpp create mode 100644 libc/src/pthread/pthread_condattr_setpshared.h create mode 100644 libc/test/src/pthread/pthread_condattr_test.cpp diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index 9964971f191b75..5fb92a9c299cc3 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -175,6 +175,7 @@ def PThreadAPI : PublicAPI<"pthread.h"> { "__pthread_start_t", "__pthread_tss_dtor_t", "pthread_attr_t", + "pthread_condattr_t", "pthread_mutex_t", "pthread_mutexattr_t", "pthread_t", @@ -241,10 +242,30 @@ def SysSendfileAPI : PublicAPI<"sys/sendfile.h"> { } def SysTypesAPI : PublicAPI<"sys/types.h"> { - let Types = ["blkcnt_t", "blksize_t", "clockid_t", "dev_t", "gid_t", "ino_t", - "mode_t", "nlink_t", "off_t", "pid_t", "pthread_attr_t", "pthread_key_t", - "pthread_mutex_t", "pthread_mutexattr_t", "pthread_once_t", "pthread_t", - "size_t", "ssize_t", "suseconds_t", "time_t", "uid_t"]; + let Types = [ + "blkcnt_t", + "blksize_t", + "clockid_t", + "dev_t", + "gid_t", + "ino_t", + "mode_t", + "nlink_t", + "off_t", + "pid_t", + "pthread_attr_t", + "pthread_condattr_t", + "pthread_key_t", + "pthread_mutex_t", + "pthread_mutexattr_t", + "pthread_once_t", + "pthread_t", + "size_t", + "ssize_t", + "suseconds_t", + "time_t", + "uid_t" + ]; } def SysUtsNameAPI : PublicAPI<"sys/utsname.h"> { diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 70f130a4399a36..2d8136536b218b 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -639,6 +639,12 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.pthread.pthread_attr_setguardsize libc.src.pthread.pthread_attr_setstack libc.src.pthread.pthread_attr_setstacksize + libc.src.pthread.pthread_condattr_destroy + libc.src.pthread.pthread_condattr_getclock + libc.src.pthread.pthread_condattr_getpshared + libc.src.pthread.pthread_condattr_init + libc.src.pthread.pthread_condattr_setclock + libc.src.pthread.pthread_condattr_setpshared libc.src.pthread.pthread_create libc.src.pthread.pthread_detach libc.src.pthread.pthread_equal diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index b85366c8deafe0..f5ba2791af3fb8 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -321,6 +321,7 @@ add_gen_header( .llvm-libc-types.__pthread_start_t .llvm-libc-types.__pthread_tss_dtor_t .llvm-libc-types.pthread_attr_t + .llvm-libc-types.pthread_condattr_t .llvm-libc-types.pthread_mutex_t .llvm-libc-types.pthread_mutexattr_t .llvm-libc-types.pthread_t diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index 93a79e1477b337..f26fc0729dc94c 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -49,11 +49,12 @@ add_header(pid_t HDR pid_t.h) add_header(posix_spawn_file_actions_t HDR posix_spawn_file_actions_t.h) add_header(posix_spawnattr_t HDR posix_spawnattr_t.h) add_header(pthread_attr_t HDR pthread_attr_t.h DEPENDS .size_t) +add_header(pthread_condattr_t HDR pthread_condattr_t.h DEPENDS .clockid_t) add_header(pthread_key_t HDR pthread_key_t.h) add_header(pthread_mutex_t HDR pthread_mutex_t.h DEPENDS .__futex_word .__mutex_type) 
-add_header(pthread_t HDR pthread_t.h DEPENDS .__thread_type)
 add_header(pthread_mutexattr_t HDR pthread_mutexattr_t.h)
 add_header(pthread_once_t HDR pthread_once_t.h DEPENDS .__futex_word)
+add_header(pthread_t HDR pthread_t.h DEPENDS .__thread_type)
 add_header(rlim_t HDR rlim_t.h)
 add_header(time_t HDR time_t.h)
 add_header(stack_t HDR stack_t.h)
diff --git a/libc/include/llvm-libc-types/pthread_condattr_t.h b/libc/include/llvm-libc-types/pthread_condattr_t.h
new file mode 100644
index 00000000000000..b91fc2950aa3f2
--- /dev/null
+++ b/libc/include/llvm-libc-types/pthread_condattr_t.h
@@ -0,0 +1,18 @@
+//===-- Definition of pthread_condattr_t type -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_TYPES_PTHREAD_CONDATTR_T_H
+#define LLVM_LIBC_TYPES_PTHREAD_CONDATTR_T_H
+
+#include "clockid_t.h"
+
+typedef struct {
+  clockid_t clock;
+  int pshared;
+} pthread_condattr_t;
+
+#endif // LLVM_LIBC_TYPES_PTHREAD_CONDATTR_T_H
diff --git a/libc/include/pthread.h.def b/libc/include/pthread.h.def
index abeb839ee83d16..a94d770657e100 100644
--- a/libc/include/pthread.h.def
+++ b/libc/include/pthread.h.def
@@ -11,6 +11,9 @@
 
 #include "__llvm-libc-common.h"
 
+// TODO: move to a pthreads-macros.h file:
+// https://github.com/llvm/llvm-project/issues/88997
+
 #define PTHREAD_STACK_MIN (1 << 14) // 16KB
 
 #define PTHREAD_MUTEX_INITIALIZER {0}
@@ -32,6 +35,9 @@ enum {
   PTHREAD_MUTEX_ROBUST = 0x1,
 };
 
+#define PTHREAD_PROCESS_PRIVATE 0
+#define PTHREAD_PROCESS_SHARED 1
+
 %%public_api()
 
 #endif // LLVM_LIBC_PTHREAD_H
diff --git a/libc/spec/posix.td b/libc/spec/posix.td
index 7095a3964ee3fb..0c88dbd848a3fb 100644
--- a/libc/spec/posix.td
+++ b/libc/spec/posix.td
@@ -26,6 +26,7 @@ def UidT : NamedType<"uid_t">;
 def GidT : NamedType<"gid_t">;
 def DevT : NamedType<"dev_t">;
 def ClockIdT : NamedType<"clockid_t">;
+def RestrictedClockIdTPtr : RestrictedPtrType<ClockIdT>;
 def BlkSizeT : NamedType<"blksize_t">;
 def BlkCntT : NamedType<"blkcnt_t">;
 def NLinkT : NamedType<"nlink_t">;
@@ -105,6 +106,10 @@ def POSIX : StandardSpec<"POSIX"> {
   ConstType ConstPThreadAttrTPtr = ConstType<PThreadAttrTPtr>;
   ConstType ConstRestrictedPThreadAttrTPtr = ConstType<RestrictedPThreadAttrTPtr>;
 
+  NamedType PThreadCondAttrTType = NamedType<"pthread_condattr_t">;
+  PtrType PThreadCondAttrTPtr = PtrType<PThreadCondAttrTType>;
+  ConstType ConstRestrictedPThreadCondAttrTPtr = ConstType<RestrictedPtrType<PThreadCondAttrTType>>;
+
   NamedType PThreadMutexAttrTType = NamedType<"pthread_mutexattr_t">;
   PtrType PThreadMutexAttrTPtr = PtrType<PThreadMutexAttrTType>;
   RestrictedPtrType RestrictedPThreadMutexAttrTPtr = RestrictedPtrType<PThreadMutexAttrTType>;
@@ -980,7 +985,9 @@ def POSIX : StandardSpec<"POSIX"> {
       [], // Macros
       [
         AtForkCallbackT,
+        ClockIdT,
         PThreadAttrTType,
+        PThreadCondAttrTType,
         PThreadKeyT,
         PThreadMutexAttrTType,
         PThreadMutexTType,
@@ -1047,6 +1054,36 @@ def POSIX : StandardSpec<"POSIX"> {
          RetValSpec,
          [ArgSpec, ArgSpec, ArgSpec]
      >,
+      FunctionSpec<
+          "pthread_condattr_destroy",
+          RetValSpec<IntType>,
+          [ArgSpec<PThreadCondAttrTPtr>]
+      >,
+      FunctionSpec<
+          "pthread_condattr_getclock",
+          RetValSpec<IntType>,
+          [ArgSpec<ConstRestrictedPThreadCondAttrTPtr>, ArgSpec<RestrictedClockIdTPtr>]
+      >,
+      FunctionSpec<
+          "pthread_condattr_getpshared",
+          RetValSpec<IntType>,
+          [ArgSpec<ConstRestrictedPThreadCondAttrTPtr>, ArgSpec<RestrictedIntPtr>]
+      >,
+      FunctionSpec<
+          "pthread_condattr_init",
+          RetValSpec<IntType>,
+          [ArgSpec<PThreadCondAttrTPtr>]
+      >,
+      FunctionSpec<
+          "pthread_condattr_setclock",
+          RetValSpec<IntType>,
+          [ArgSpec<PThreadCondAttrTPtr>, ArgSpec<ClockIdT>]
+      >,
+      FunctionSpec<
+          "pthread_condattr_setpshared",
+          RetValSpec<IntType>,
+          [ArgSpec<PThreadCondAttrTPtr>, ArgSpec<IntType>]
+      >,
      FunctionSpec<
          "pthread_create",
          RetValSpec,
          [ArgSpec, ArgSpec, ArgSpec]
      >,
@@ -1522,9 +1559,30 @@ def POSIX : StandardSpec<"POSIX"> {
   HeaderSpec SysTypes = HeaderSpec<
       "sys/types.h",
       [], // Macros
-      [BlkCntT, BlkSizeT, ClockIdT, DevT, GidT, InoT, ModeTType, NLinkT, OffTType, PidT,
-       PThreadAttrTType, PThreadKeyT, PThreadMutexTType, PThreadMutexAttrTType, PThreadOnceT, PThreadTType,
-       SizeTType, SSizeTType, SuSecondsT, TimeTType, UidT],
+      [
+        BlkCntT,
+        BlkSizeT,
+        ClockIdT,
+        DevT,
+        GidT,
+        InoT,
+        ModeTType,
+        NLinkT,
+        OffTType,
+        PThreadAttrTType,
+        PThreadCondAttrTType,
+        PThreadKeyT,
+        PThreadMutexAttrTType,
+        PThreadMutexTType,
+        PThreadOnceT,
+        PThreadTType,
+        PidT,
+        SSizeTType,
+        SizeTType,
+        SuSecondsT,
+        TimeTType,
+        UidT
+      ], // Types
       [], // Enumerations
       [] // Functions
   >;
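For reviewers tracing the plumbing: %%public_api() in pthread.h.def expands from the POSIX spec, so the six FunctionSpec entries above are what surface the new functions in the generated pthread.h. Roughly, the generated declarations should come out as below. This is a sketch of expected hdrgen output, not a file in this patch, and the exact __NOEXCEPT spelling is an assumption taken from the convention in __llvm-libc-common.h.

    // Expected shape of the generated pthread.h excerpt (sketch, not in the patch).
    int pthread_condattr_destroy(pthread_condattr_t *) __NOEXCEPT;
    int pthread_condattr_getclock(const pthread_condattr_t *__restrict,
                                  clockid_t *__restrict) __NOEXCEPT;
    int pthread_condattr_getpshared(const pthread_condattr_t *__restrict,
                                    int *__restrict) __NOEXCEPT;
    int pthread_condattr_init(pthread_condattr_t *) __NOEXCEPT;
    int pthread_condattr_setclock(pthread_condattr_t *, clockid_t) __NOEXCEPT;
    int pthread_condattr_setpshared(pthread_condattr_t *, int) __NOEXCEPT;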
diff --git a/libc/src/pthread/CMakeLists.txt b/libc/src/pthread/CMakeLists.txt
index d5e6c802a84523..3d6cf6dde010b1 100644
--- a/libc/src/pthread/CMakeLists.txt
+++ b/libc/src/pthread/CMakeLists.txt
@@ -100,6 +100,71 @@ add_entrypoint_object(
     libc.src.pthread.pthread_attr_setstacksize
 )
 
+add_entrypoint_object(
+  pthread_condattr_destroy
+  SRCS
+    pthread_condattr_destroy.cpp
+  HDRS
+    pthread_condattr_destroy.h
+  DEPENDS
+    libc.include.pthread
+)
+
+add_entrypoint_object(
+  pthread_condattr_getclock
+  SRCS
+    pthread_condattr_getclock.cpp
+  HDRS
+    pthread_condattr_getclock.h
+  DEPENDS
+    libc.include.pthread
+    libc.include.sys_types
+)
+
+add_entrypoint_object(
+  pthread_condattr_getpshared
+  SRCS
+    pthread_condattr_getpshared.cpp
+  HDRS
+    pthread_condattr_getpshared.h
+  DEPENDS
+    libc.include.pthread
+)
+
+add_entrypoint_object(
+  pthread_condattr_init
+  SRCS
+    pthread_condattr_init.cpp
+  HDRS
+    pthread_condattr_init.h
+  DEPENDS
+    libc.include.pthread
+    libc.include.time
+)
+
+add_entrypoint_object(
+  pthread_condattr_setclock
+  SRCS
+    pthread_condattr_setclock.cpp
+  HDRS
+    pthread_condattr_setclock.h
+  DEPENDS
+    libc.include.errno
+    libc.include.pthread
+    libc.include.sys_types
+    libc.include.time
+)
+
+add_entrypoint_object(
+  pthread_condattr_setpshared
+  SRCS
+    pthread_condattr_setpshared.cpp
+  HDRS
+    pthread_condattr_setpshared.h
+  DEPENDS
+    libc.include.pthread
+)
+
 add_header_library(
   pthread_mutexattr
   HDRS
diff --git a/libc/src/pthread/pthread_condattr_destroy.cpp b/libc/src/pthread/pthread_condattr_destroy.cpp
new file mode 100644
index 00000000000000..45cc011a4a92d8
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_destroy.cpp
@@ -0,0 +1,23 @@
+//===-- Implementation of the pthread_condattr_destroy --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "pthread_condattr_destroy.h"
+
+#include "src/__support/common.h"
+
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, pthread_condattr_destroy, (pthread_condattr_t * attr)) {
+  // Initializing a pthread_condattr_t acquires no resources, so this is a
+  // no-op.
+  return 0;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/pthread/pthread_condattr_destroy.h b/libc/src/pthread/pthread_condattr_destroy.h
new file mode 100644
index 00000000000000..2910fa9f96168a
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_destroy.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for pthread_condattr_destroy ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_DESTROY_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_DESTROY_H
+
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE {
+
+int pthread_condattr_destroy(pthread_condattr_t *attr);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_DESTROY_H
diff --git a/libc/src/pthread/pthread_condattr_getclock.cpp b/libc/src/pthread/pthread_condattr_getclock.cpp
new file mode 100644
index 00000000000000..a3a3963f4f429e
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_getclock.cpp
@@ -0,0 +1,25 @@
+//===-- Implementation of the pthread_condattr_getclock -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "pthread_condattr_getclock.h"
+
+#include "src/__support/common.h"
+
+#include <pthread.h>   // pthread_condattr_t
+#include <sys/types.h> // clockid_t
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, pthread_condattr_getclock,
+                   (const pthread_condattr_t *__restrict attr,
+                    clockid_t *__restrict clock_id)) {
+  *clock_id = attr->clock;
+  return 0;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/pthread/pthread_condattr_getclock.h b/libc/src/pthread/pthread_condattr_getclock.h
new file mode 100644
index 00000000000000..d5878c4f45b537
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_getclock.h
@@ -0,0 +1,22 @@
+//===-- Implementation header for pthread_condattr_getclock -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_GETCLOCK_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_GETCLOCK_H
+
+#include <pthread.h>   // pthread_condattr_t
+#include <sys/types.h> // clockid_t
+
+namespace LIBC_NAMESPACE {
+
+int pthread_condattr_getclock(const pthread_condattr_t *__restrict attr,
+                              clockid_t *__restrict clock_id);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_GETCLOCK_H
diff --git a/libc/src/pthread/pthread_condattr_getpshared.cpp b/libc/src/pthread/pthread_condattr_getpshared.cpp
new file mode 100644
index 00000000000000..0c5fdc115c25d7
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_getpshared.cpp
@@ -0,0 +1,24 @@
+//===-- Implementation of the pthread_condattr_getpshared -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "pthread_condattr_getpshared.h"
+
+#include "src/__support/common.h"
+
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, pthread_condattr_getpshared,
+                   (const pthread_condattr_t *__restrict attr,
+                    int *__restrict pshared)) {
+  *pshared = attr->pshared;
+  return 0;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/pthread/pthread_condattr_getpshared.h b/libc/src/pthread/pthread_condattr_getpshared.h
new file mode 100644
index 00000000000000..3d7a0c1d357c60
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_getpshared.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for pthread_condattr_getpshared ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_PSHARED_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_PSHARED_H
+
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE {
+
+int pthread_condattr_getpshared(const pthread_condattr_t *__restrict attr,
+                                int *__restrict pshared);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_PSHARED_H
diff --git a/libc/src/pthread/pthread_condattr_init.cpp b/libc/src/pthread/pthread_condattr_init.cpp
new file mode 100644
index 00000000000000..54633b2e3a5eaf
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_init.cpp
@@ -0,0 +1,24 @@
+//===-- Implementation of the pthread_condattr_init -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "pthread_condattr_init.h"
+
+#include "src/__support/common.h"
+
+#include <pthread.h> // pthread_condattr_t, PTHREAD_PROCESS_PRIVATE
+#include <time.h>    // CLOCK_REALTIME
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, pthread_condattr_init, (pthread_condattr_t * attr)) {
+  attr->clock = CLOCK_REALTIME;
+  attr->pshared = PTHREAD_PROCESS_PRIVATE;
+  return 0;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/pthread/pthread_condattr_init.h b/libc/src/pthread/pthread_condattr_init.h
new file mode 100644
index 00000000000000..9f3c06bb6f4aef
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_init.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for pthread_condattr_init ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_INIT_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_INIT_H
+
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE {
+
+int pthread_condattr_init(pthread_condattr_t *attr);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_INIT_H
diff --git a/libc/src/pthread/pthread_condattr_setclock.cpp b/libc/src/pthread/pthread_condattr_setclock.cpp
new file mode 100644
index 00000000000000..6eca8b30ef7f8e
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_setclock.cpp
@@ -0,0 +1,30 @@
+//===-- Implementation of the pthread_condattr_setclock -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "pthread_condattr_setclock.h"
+
+#include "src/__support/common.h"
+
+#include <errno.h>     // EINVAL
+#include <pthread.h>   // pthread_condattr_t
+#include <sys/types.h> // clockid_t
+#include <time.h>      // CLOCK_MONOTONIC, CLOCK_REALTIME
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, pthread_condattr_setclock,
+                   (pthread_condattr_t * attr, clockid_t clock)) {
+
+  if (clock != CLOCK_MONOTONIC && clock != CLOCK_REALTIME)
+    return EINVAL;
+
+  attr->clock = clock;
+  return 0;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/pthread/pthread_condattr_setclock.h b/libc/src/pthread/pthread_condattr_setclock.h
new file mode 100644
index 00000000000000..328766fe788336
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_setclock.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for pthread_condattr_setclock -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETCLOCK_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETCLOCK_H
+
+#include <pthread.h>
+#include <sys/types.h> // clockid_t
+
+namespace LIBC_NAMESPACE {
+
+int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETCLOCK_H
diff --git a/libc/src/pthread/pthread_condattr_setpshared.cpp b/libc/src/pthread/pthread_condattr_setpshared.cpp
new file mode 100644
index 00000000000000..7f1560acad843e
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_setpshared.cpp
@@ -0,0 +1,28 @@
+//===-- Implementation of the pthread_condattr_setpshared -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "pthread_condattr_setpshared.h"
+
+#include "src/__support/common.h"
+
+#include <errno.h>   // EINVAL
+#include <pthread.h> // pthread_condattr_t, PTHREAD_PROCESS_SHARED, PTHREAD_PROCESS_PRIVATE
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, pthread_condattr_setpshared,
+                   (pthread_condattr_t * attr, int pshared)) {
+
+  if (pshared != PTHREAD_PROCESS_SHARED && pshared != PTHREAD_PROCESS_PRIVATE)
+    return EINVAL;
+
+  attr->pshared = pshared;
+  return 0;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/pthread/pthread_condattr_setpshared.h b/libc/src/pthread/pthread_condattr_setpshared.h
new file mode 100644
index 00000000000000..8083bdec78cc47
--- /dev/null
+++ b/libc/src/pthread/pthread_condattr_setpshared.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for pthread_condattr_setpshared ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETPSHARED_H
+#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETPSHARED_H
+
+#include <pthread.h>
+
+namespace LIBC_NAMESPACE {
+
+int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETPSHARED_H
diff --git a/libc/test/src/pthread/CMakeLists.txt b/libc/test/src/pthread/CMakeLists.txt
index fb0d22ab9d2a57..46f38422a53745 100644
--- a/libc/test/src/pthread/CMakeLists.txt
+++ b/libc/test/src/pthread/CMakeLists.txt
@@ -39,3 +39,15 @@ add_libc_unittest(
     libc.src.pthread.pthread_mutexattr_setrobust
     libc.src.pthread.pthread_mutexattr_settype
 )
+
+add_libc_unittest(
+  pthread_condattr_test
+  SUITE
+    libc_pthread_unittests
+  SRCS
+    pthread_condattr_test.cpp
+  DEPENDS
+    libc.include.errno
+    libc.include.pthread
+    libc.include.time
+  )
diff --git a/libc/test/src/pthread/pthread_condattr_test.cpp b/libc/test/src/pthread/pthread_condattr_test.cpp
new file mode 100644
index 00000000000000..accb62de92e45f
--- /dev/null
+++ b/libc/test/src/pthread/pthread_condattr_test.cpp
@@ -0,0 +1,71 @@
+//===-- Unittests for pthread_condattr_t ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "test/UnitTest/Test.h"
+
+#include <errno.h>
+#include <pthread.h>
+#include <time.h>
+
+TEST(LlvmLibcPThreadCondAttrTest, InitAndDestroy) {
+  pthread_condattr_t cond;
+  ASSERT_EQ(pthread_condattr_init(&cond), 0);
+  ASSERT_EQ(pthread_condattr_destroy(&cond), 0);
+}
+
+TEST(LlvmLibcPThreadCondAttrTest, GetDefaultValues) {
+  pthread_condattr_t cond;
+
+  // Invalid clock id.
+  clockid_t clock = 7;
+  // Invalid value.
+  int pshared = 42;
+
+  ASSERT_EQ(pthread_condattr_init(&cond), 0);
+  ASSERT_EQ(pthread_condattr_getclock(&cond, &clock), 0);
+  ASSERT_EQ(clock, CLOCK_REALTIME);
+  ASSERT_EQ(pthread_condattr_getpshared(&cond, &pshared), 0);
+  ASSERT_EQ(pshared, PTHREAD_PROCESS_PRIVATE);
+  ASSERT_EQ(pthread_condattr_destroy(&cond), 0);
+}
+
+TEST(LlvmLibcPThreadCondAttrTest, SetGoodValues) {
+  pthread_condattr_t cond;
+
+  // Invalid clock id.
+  clockid_t clock = 7;
+  // Invalid value.
+  int pshared = 42;
+
+  ASSERT_EQ(pthread_condattr_init(&cond), 0);
+  ASSERT_EQ(pthread_condattr_setclock(&cond, CLOCK_MONOTONIC), 0);
+  ASSERT_EQ(pthread_condattr_getclock(&cond, &clock), 0);
+  ASSERT_EQ(clock, CLOCK_MONOTONIC);
+  ASSERT_EQ(pthread_condattr_setpshared(&cond, PTHREAD_PROCESS_SHARED), 0);
+  ASSERT_EQ(pthread_condattr_getpshared(&cond, &pshared), 0);
+  ASSERT_EQ(pshared, PTHREAD_PROCESS_SHARED);
+  ASSERT_EQ(pthread_condattr_destroy(&cond), 0);
+}
+
+TEST(LlvmLibcPThreadCondAttrTest, SetBadValues) {
+  pthread_condattr_t cond;
+
+  // Invalid clock id.
+  clockid_t clock = 7;
+  // Invalid value.
+  int pshared = 42;
+
+  ASSERT_EQ(pthread_condattr_init(&cond), 0);
+  ASSERT_EQ(pthread_condattr_setclock(&cond, clock), EINVAL);
+  ASSERT_EQ(pthread_condattr_getclock(&cond, &clock), 0);
+  ASSERT_EQ(clock, CLOCK_REALTIME);
+  ASSERT_EQ(pthread_condattr_setpshared(&cond, pshared), EINVAL);
+  ASSERT_EQ(pthread_condattr_getpshared(&cond, &pshared), 0);
+  ASSERT_EQ(pshared, PTHREAD_PROCESS_PRIVATE);
+  ASSERT_EQ(pthread_condattr_destroy(&cond), 0);
+}
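A usage sketch for context before the next patch: this is how the new attribute functions compose in client code. It is illustrative only, it assumes a working pthread_cond_init (which this series does not add), and the helper name make_monotonic_cond is invented here.

    #include <pthread.h>
    #include <time.h>

    // Configure a condition variable to measure timed waits against
    // CLOCK_MONOTONIC, making pthread_cond_timedwait immune to wall-clock
    // adjustments.
    static int make_monotonic_cond(pthread_cond_t *cond) {
      pthread_condattr_t attr;
      int err = pthread_condattr_init(&attr); // CLOCK_REALTIME, PROCESS_PRIVATE
      if (err != 0)
        return err;
      err = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); // else EINVAL
      if (err == 0)
        err = pthread_cond_init(cond, &attr); // attribute values are copied here
      pthread_condattr_destroy(&attr); // a no-op in this implementation, but
                                       // still required for POSIX portability
      return err;
    }

Note that the setters validate their arguments and return EINVAL rather than trusting callers, which is why SetBadValues above exercises the rejected-value paths explicitly.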
From 4edeaffbf255137861f5153eb1a6183d956efede Mon Sep 17 00:00:00 2001
From: fabrizio-indirli
Date: Wed, 17 Apr 2024 17:43:22 +0100
Subject: [PATCH 276/300] [mlir][tosa] Fix tosa.Resize-to-linalg lowering
 (#88514)

---
 .../Conversion/TosaToLinalg/TosaToLinalg.cpp  |  21 ++--
 .../TosaToLinalg/tosa-to-linalg-resize.mlir   | 112 ++++++++----------
 2 files changed, 60 insertions(+), 73 deletions(-)

diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index d8dd1c93722b09..af19ebaea937d0 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -1582,17 +1582,16 @@ class GenericResizeConverter : public OpRewritePattern<tosa::ResizeOp> {
       }
       // x = x * scale_d + offset;
       // ix = floor(x / scale_n)
-      // dx = x / scale_n - ix
-      Value val = b.create<arith::UIToFPOp>(floatTy, in);
-      scaleN = b.create<arith::UIToFPOp>(floatTy, scaleN);
-      scaleD = b.create<arith::UIToFPOp>(floatTy, scaleD);
-      offset = b.create<arith::SIToFPOp>(floatTy, offset);
-      val = b.create<arith::MulFOp>(val, scaleD);
-      val = b.create<arith::AddFOp>(val, offset);
-      val = b.create<arith::DivFOp>(val, scaleN);
-      index = b.create<math::FloorOp>(val);
-      delta = b.create<arith::SubFOp>(val, index);
-      index = b.create<arith::FPToSIOp>(b.getI32Type(), index);
+      Value val = b.create<arith::MulIOp>(in, scaleD);
+      val = b.create<arith::AddIOp>(val, offset);
+      index = b.create<arith::FloorDivSIOp>(val, scaleN);
+
+      // rx = x % scale_n
+      // dx = rx / scale_n
+      Value r = b.create<arith::RemSIOp>(val, scaleN);
+      Value rFp = b.create<arith::SIToFPOp>(floatTy, r);
+      Value scaleNfp = b.create<arith::UIToFPOp>(floatTy, scaleN);
+      delta = b.create<arith::DivFOp>(rFp, scaleNfp);
     };
 
     // Compute the ix and dx values for the X and Y dimensions - int case.
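To make the rewritten arithmetic concrete, here is a scalar model of the integer path in plain C++; SourcePos and resizeSource are names invented for the illustration, and it assumes x >= 0 so that plain / and % agree with arith.floordivsi and arith.remsi (the generated IR handles negative x through those ops). The point of the change is that ix and the remainder are now computed exactly in integers, with only the final dx division in floating point, so indices can no longer drift once x exceeds the 24-bit exact-integer range of f32, which is what the old uitofp/divf/floor sequence risked.

    #include <cassert>
    #include <cstdint>

    // Mirrors the generated IR: x = out * scale_d + offset,
    // ix = x floordiv scale_n, dx = (x rem scale_n) / scale_n.
    struct SourcePos {
      int32_t ix; // input pixel index
      float dx;   // fractional offset in [0, 1)
    };

    static SourcePos resizeSource(int32_t out, int32_t scaleN, int32_t scaleD,
                                  int32_t offset) {
      int32_t x = out * scaleD + offset;
      return {x / scaleN,
              static_cast<float>(x % scaleN) / static_cast<float>(scaleN)};
    }

    int main() {
      // Constants from the bilinear test below: scale 4/1, offset 0.
      SourcePos s = resizeSource(/*out=*/5, /*scaleN=*/4, /*scaleD=*/1,
                                 /*offset=*/0);
      assert(s.ix == 1 && s.dx == 0.25f); // 5 = 1 * 4 + 1

      // The case the integer path fixes: 2^24 + 1 is not representable in
      // f32, so the old float computation could round ix onto the wrong pixel.
      SourcePos t = resizeSource((1 << 24) + 1, 1, 1, 0);
      assert(t.ix == (1 << 24) + 1 && t.dx == 0.0f);
      return 0;
    }

The updated CHECK lines in the test diff that follows correspond one-to-one to this sequence: muli, addi, floordivsi, remsi, sitofp, uitofp, divf.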
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir index 468e92e2a2661f..d42d0a46692d47 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir @@ -304,42 +304,36 @@ func.func @resize_nearest_fp32(%input: tensor<1x50x48x1xf32>) -> () { // CHECK-DAG: %[[XMAX:.*]] = arith.constant 47 // CHECK: %[[Y:.+]] = arith.index_cast %[[IDX1]] // CHECK: %[[X:.+]] = arith.index_cast %[[IDX2]] - // CHECK-DAG: %[[ISCALE_Y_N:.*]] = arith.constant 64 - // CHECK-DAG: %[[ISCALE_Y_D:.*]] = arith.constant 2 - // CHECK-DAG: %[[ISCALE_X_N:.*]] = arith.constant 64 - // CHECK-DAG: %[[ISCALE_X_D:.*]] = arith.constant 2 - // CHECK-DAG: %[[IOFFSET_Y:.*]] = arith.constant -31 - // CHECK-DAG: %[[IOFFSET_X:.*]] = arith.constant -31 - // CHECK-DAG: %[[IBORDER_Y:.*]] = arith.constant 31 - // CHECK-DAG: %[[IBORDER_X:.*]] = arith.constant 31 - - // CHECK: %[[Y0:.+]] = arith.uitofp %[[Y]] - // CHECK: %[[SCALE_Y_N:.*]] = arith.uitofp %[[ISCALE_Y_N]] - // CHECK: %[[SCALE_Y_D:.*]] = arith.uitofp %[[ISCALE_Y_D]] - // CHECK: %[[OFFSET_Y:.*]] = arith.sitofp %[[IOFFSET_Y]] - // CHECK: %[[VAL_29:.*]] = arith.mulf %[[Y0]], %[[SCALE_Y_D]] - // CHECK: %[[VAL_31:.*]] = arith.addf %[[VAL_29]], %[[OFFSET_Y]] - // CHECK: %[[VAL_33:.*]] = arith.divf %[[VAL_31]], %[[SCALE_Y_N]] - // CHECK: %[[VAL_35:.*]] = math.floor %[[VAL_33]] - // CHECK: %[[D_Y:.*]] = arith.subf %[[VAL_33]], %[[VAL_35]] - // CHECK: %[[VAL_39:.*]] = arith.fptosi %[[VAL_35]] - - // CHECK: %[[X0:.+]] = arith.uitofp %[[X]] - // CHECK: %[[SCALE_X_N:.*]] = arith.uitofp %[[ISCALE_X_N]] - // CHECK: %[[SCALE_X_D:.*]] = arith.uitofp %[[ISCALE_X_D]] - // CHECK: %[[OFFSET_X:.*]] = arith.sitofp %[[IOFFSET_X]] - // CHECK: %[[VAL_30:.*]] = arith.mulf %[[X0]], %[[SCALE_X_D]] - // CHECK: %[[VAL_32:.*]] = arith.addf %[[VAL_30]], %[[OFFSET_X]] - // CHECK: %[[VAL_34:.*]] = arith.divf %[[VAL_32]], %[[SCALE_X_N]] - // CHECK: %[[VAL_36:.*]] = math.floor %[[VAL_34]] - // CHECK: %[[D_X:.*]] = arith.subf %[[VAL_34]], %[[VAL_36]] - // CHECK: %[[VAL_40:.*]] = arith.fptosi %[[VAL_36]] + // CHECK-DAG: %[[SCALE_Y_N:.*]] = arith.constant 64 + // CHECK-DAG: %[[SCALE_Y_D:.*]] = arith.constant 2 + // CHECK-DAG: %[[SCALE_X_N:.*]] = arith.constant 64 + // CHECK-DAG: %[[SCALE_X_D:.*]] = arith.constant 2 + // CHECK-DAG: %[[OFFSET_Y:.*]] = arith.constant -31 + // CHECK-DAG: %[[OFFSET_X:.*]] = arith.constant -31 + // CHECK-DAG: %[[BORDER_Y:.*]] = arith.constant 31 + // CHECK-DAG: %[[BORDER_X:.*]] = arith.constant 31 + + // CHECK: %[[VAL_29:.*]] = arith.muli %[[Y]], %[[SCALE_Y_D]] + // CHECK: %[[Y_TEMP:.*]] = arith.addi %[[VAL_29]], %[[OFFSET_Y]] + // CHECK: %[[IY_TEMP:.*]] = arith.floordivsi %[[Y_TEMP]], %[[SCALE_Y_N]] + // CHECK: %[[RY:.*]] = arith.remsi %[[Y_TEMP]], %[[SCALE_Y_N]] + // CHECK: %[[RY_FP:.*]] = arith.sitofp %[[RY]] + // CHECK: %[[SCALE_Y_N_FP:.*]] = arith.uitofp %[[SCALE_Y_N]] + // CHECK: %[[D_Y:.*]] = arith.divf %[[RY_FP]], %[[SCALE_Y_N_FP]] + + // CHECK: %[[VAL_30:.*]] = arith.muli %[[X]], %[[SCALE_X_D]] + // CHECK: %[[X_TEMP:.*]] = arith.addi %[[VAL_30]], %[[OFFSET_X]] + // CHECK: %[[IX_TEMP:.*]] = arith.floordivsi %[[X_TEMP]], %[[SCALE_X_N]] + // CHECK: %[[RX:.*]] = arith.remsi %[[X_TEMP]], %[[SCALE_X_N]] + // CHECK: %[[RX_FP:.*]] = arith.sitofp %[[RX]] + // CHECK: %[[SCALE_X_N_FP:.*]] = arith.uitofp %[[SCALE_X_N]] + // CHECK: %[[D_X:.*]] = arith.divf %[[RX_FP]], %[[SCALE_X_N_FP]] // CHECK-DAG: %[[ONE:.*]] = arith.constant 1 
// CHECK-DAG: %[[HALF:.*]] = arith.constant 5.000000e-01 // CHECK: %[[PRED_Y:.*]] = arith.cmpf oge, %[[D_Y]], %[[HALF]] // CHECK: %[[ROUND_Y:.*]] = arith.select %[[PRED_Y]], %[[ONE]], %[[ZERO]] - // CHECK: %[[VAL_48:.*]] = arith.addi %[[VAL_39]], %[[ROUND_Y]] + // CHECK: %[[VAL_48:.*]] = arith.addi %[[IY_TEMP]], %[[ROUND_Y]] // CHECK: %[[LOWER:.*]] = arith.maxsi %[[ZERO]], %[[VAL_48]] // CHECK: %[[CLAMPED:.*]] = arith.minsi %[[YMAX]], %[[LOWER]] // CHECK: %[[IDY:.*]] = arith.index_cast %[[CLAMPED]] @@ -347,7 +341,7 @@ func.func @resize_nearest_fp32(%input: tensor<1x50x48x1xf32>) -> () { // CHECK-DAG: %[[HALF:.*]] = arith.constant 5.000000e-01 // CHECK: %[[PRED_X:.*]] = arith.cmpf oge, %[[D_X]], %[[HALF]] // CHECK: %[[ROUND_X:.*]] = arith.select %[[PRED_X]], %[[ONE]], %[[ZERO]] - // CHECK: %[[VAL_49:.*]] = arith.addi %[[VAL_40]], %[[ROUND_X]] + // CHECK: %[[VAL_49:.*]] = arith.addi %[[IX_TEMP]], %[[ROUND_X]] // CHECK: %[[LOWER:.*]] = arith.maxsi %[[ZERO]], %[[VAL_49]] // CHECK: %[[CLAMPED:.*]] = arith.minsi %[[XMAX]], %[[LOWER]] // CHECK: %[[IDX:.*]] = arith.index_cast %[[CLAMPED]] @@ -374,36 +368,30 @@ func.func @resize_bilinear_fp(%input: tensor<1x23x24x1xf32>) -> () { // CHECK-DAG: %[[X_MAX:.*]] = arith.constant 23 // CHECK: %[[Y:.+]] = arith.index_cast %[[IDX_1]] // CHECK: %[[X:.+]] = arith.index_cast %[[IDX_2]] - // CHECK-DAG: %[[ISCALE_Y_N:.*]] = arith.constant 4 - // CHECK-DAG: %[[ISCALE_Y_D:.*]] = arith.constant 1 - // CHECK-DAG: %[[ISCALE_X_N:.*]] = arith.constant 4 - // CHECK-DAG: %[[ISCALE_X_D:.*]] = arith.constant 1 - // CHECK-DAG: %[[IOFFSET_Y:.*]] = arith.constant 0 - // CHECK-DAG: %[[IOFFSET_X:.*]] = arith.constant 0 - // CHECK-DAG: %[[IBORDER_Y:.*]] = arith.constant 0 - // CHECK-DAG: %[[IBORDER_X:.*]] = arith.constant 0 - - // CHECK: %[[Y0:.+]] = arith.uitofp %[[Y]] - // CHECK: %[[SCALE_Y_N:.*]] = arith.uitofp %[[ISCALE_Y_N]] - // CHECK: %[[SCALE_Y_D:.*]] = arith.uitofp %[[ISCALE_Y_D]] - // CHECK: %[[OFFSET_Y:.*]] = arith.sitofp %[[IOFFSET_Y]] - // CHECK: %[[VAL_29:.*]] = arith.mulf %[[Y0]], %[[SCALE_Y_D]] - // CHECK: %[[VAL_31:.*]] = arith.addf %[[VAL_29]], %[[OFFSET_Y]] - // CHECK: %[[VAL_33:.*]] = arith.divf %[[VAL_31]], %[[SCALE_Y_N]] - // CHECK: %[[VAL_35:.*]] = math.floor %[[VAL_33]] - // CHECK: %[[D_Y:.*]] = arith.subf %[[VAL_33]], %[[VAL_35]] - // CHECK: %[[I_Y:.*]] = arith.fptosi %[[VAL_35]] - - // CHECK: %[[X0:.+]] = arith.uitofp %[[X]] - // CHECK: %[[SCALE_X_N:.*]] = arith.uitofp %[[ISCALE_X_N]] - // CHECK: %[[SCALE_X_D:.*]] = arith.uitofp %[[ISCALE_X_D]] - // CHECK: %[[OFFSET_X:.*]] = arith.sitofp %[[IOFFSET_X]] - // CHECK: %[[VAL_30:.*]] = arith.mulf %[[X0]], %[[SCALE_X_D]] - // CHECK: %[[VAL_32:.*]] = arith.addf %[[VAL_30]], %[[OFFSET_X]] - // CHECK: %[[VAL_34:.*]] = arith.divf %[[VAL_32]], %[[SCALE_X_N]] - // CHECK: %[[VAL_36:.*]] = math.floor %[[VAL_34]] - // CHECK: %[[D_X:.*]] = arith.subf %[[VAL_34]], %[[VAL_36]] - // CHECK: %[[I_X:.*]] = arith.fptosi %[[VAL_36]] + // CHECK-DAG: %[[SCALE_Y_N:.*]] = arith.constant 4 + // CHECK-DAG: %[[SCALE_Y_D:.*]] = arith.constant 1 + // CHECK-DAG: %[[SCALE_X_N:.*]] = arith.constant 4 + // CHECK-DAG: %[[SCALE_X_D:.*]] = arith.constant 1 + // CHECK-DAG: %[[OFFSET_Y:.*]] = arith.constant 0 + // CHECK-DAG: %[[OFFSET_X:.*]] = arith.constant 0 + // CHECK-DAG: %[[BORDER_Y:.*]] = arith.constant 0 + // CHECK-DAG: %[[BORDER_X:.*]] = arith.constant 0 + + // CHECK: %[[VAL_29:.*]] = arith.muli %[[Y]], %[[SCALE_Y_D]] + // CHECK: %[[Y_TEMP:.*]] = arith.addi %[[VAL_29]], %[[OFFSET_Y]] + // CHECK: %[[I_Y:.*]] = arith.floordivsi 
%[[Y_TEMP]], %[[SCALE_Y_N]] + // CHECK: %[[RY:.*]] = arith.remsi %[[Y_TEMP]], %[[SCALE_Y_N]] + // CHECK: %[[RY_FP:.*]] = arith.sitofp %[[RY]] + // CHECK: %[[SCALE_Y_N_FP:.*]] = arith.uitofp %[[SCALE_Y_N]] + // CHECK: %[[D_Y:.*]] = arith.divf %[[RY_FP]], %[[SCALE_Y_N_FP]] + + // CHECK: %[[VAL_30:.*]] = arith.muli %[[X]], %[[SCALE_X_D]] + // CHECK: %[[X_TEMP:.*]] = arith.addi %[[VAL_30]], %[[OFFSET_X]] + // CHECK: %[[I_X:.*]] = arith.floordivsi %[[X_TEMP]], %[[SCALE_X_N]] + // CHECK: %[[RX:.*]] = arith.remsi %[[X_TEMP]], %[[SCALE_X_N]] + // CHECK: %[[RX_FP:.*]] = arith.sitofp %[[RX]] + // CHECK: %[[SCALE_X_N_FP:.*]] = arith.uitofp %[[SCALE_X_N]] + // CHECK: %[[D_X:.*]] = arith.divf %[[RX_FP]], %[[SCALE_X_N_FP]] // Compute the left, right, and top indices for the bilinear interpolation. From 564f9abfcc3b99a01843f88b5a2c7309bfab5a33 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 17 Apr 2024 10:11:03 -0700 Subject: [PATCH 277/300] [bazel][mlir] Add missing dep after 4f88c2311130791cf69da34b743b1b3ba7584a7b --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 03386549a01163..58538b66c5e0c7 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -577,6 +577,7 @@ mlir_c_api_cc_library( ], includes = ["include"], deps = [ + ":IR", ":NVGPUDialect", ], ) From e59632bdfd4c70caed437216af17d335858686fd Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Wed, 17 Apr 2024 06:19:11 -0700 Subject: [PATCH 278/300] [RISCV] Fix typo in RISCVScheduleV.td that was introduced in 60a1158 --- llvm/lib/Target/RISCV/RISCVScheduleV.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index 5993884bc2c1cc..5be06d4c3f7e70 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -1079,7 +1079,7 @@ defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>; defm "" : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>; defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>; defm "" : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>; -defm "" : LMULSEWReadAdvanceFW<"SEWReadVFNCvtIToFV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>; defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>; defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>; From 676d3bafc09d0c331a04b813804407334de12917 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 17 Apr 2024 10:20:07 -0700 Subject: [PATCH 279/300] [bazel][libc] Add missing dep after b854a2323337be2633b1135f590678a17e9d1ade --- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 6029cc3fee6108..be02c227043218 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -786,7 +786,7 @@ libc_support_library( ":errno", ":hdr_fenv_macros", ":hdr_math_macros", - ":types_fenv_t" + ":types_fenv_t", ], ) @@ -1152,6 +1152,7 @@ libc_function( deps = [ ":__support_common", ":__support_fputil_fenv_impl", + ":types_fexcept_t", ], ) @@ -1272,7 +1273,7 @@ libc_function( deps = [ ":__support_common", ":__support_fputil_fenv_impl", - ":types_fexcept_t" + ":types_fexcept_t", ], ) @@ -1283,7 +1284,7 @@ 
libc_function(
   deps = [
     ":__support_common",
     ":__support_fputil_fenv_impl",
-    ":types_fexcept_t",
+    ":types_fexcept_t",
   ],
 )

From 693a458287d019c5c6a66fe3019d099df2978cdb Mon Sep 17 00:00:00 2001
From: Abdul Raheem <55028856+abdulraheembeigh@users.noreply.github.com>
Date: Wed, 17 Apr 2024 18:24:04 +0100
Subject: [PATCH 280/300] [MLIR] Update doc comment in ViewLikeInterface.td
 (NFC) (#89074)

Signed-off-by: Abdul Raheem Beigh
---
 mlir/include/mlir/Interfaces/ViewLikeInterface.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Interfaces/ViewLikeInterface.td b/mlir/include/mlir/Interfaces/ViewLikeInterface.td
index ea5bb1b5ac4853..9397f271e1bc62 100644
--- a/mlir/include/mlir/Interfaces/ViewLikeInterface.td
+++ b/mlir/include/mlir/Interfaces/ViewLikeInterface.td
@@ -158,7 +158,7 @@ def OffsetSizeAndStrideOpInterface : OpInterface<"OffsetSizeAndStrideOpInterface
     >,
     InterfaceMethod<
       /*desc=*/[{
-        Return a vector of all the static or dynamic sizes of the op.
+        Return a vector of all the static or dynamic offsets of the op.
       }],
      /*retTy=*/"::llvm::SmallVector<::mlir::OpFoldResult, 4>",
      /*methodName=*/"getMixedOffsets",

From 6f7160eedb2db02f37d4ffd52fff7b0cf88b3fdc Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 16 Apr 2024 14:55:41 -0400
Subject: [PATCH 281/300] [SLP]Attempt to vectorize long stores, if short one
 failed.

We can try to vectorize long store sequences, if short ones were
unsuccessful because of the non-profitable vectorization. It should not
increase compile time significantly (stores are sorted already,
complexity is n x log n), but vectorize extra code.

Metric: size..text

Program                                                                  size..text
                                                                         results     results0    diff
test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test   1088012.00  1088236.00   0.0%
test-suite :: SingleSource/UnitTests/matrix-types-spec.test              480396.00   480476.00   0.0%
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test      664613.00   664661.00   0.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test     664613.00   664661.00   0.0%
test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test  2041105.00  2040961.00  -0.0%
test-suite :: MultiSource/Applications/JM/lencod/lencod.test             836563.00   836387.00  -0.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test           1035100.00  1032140.00  -0.3%

In all benchmarks extra code gets vectorized

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/88563
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 106 ++++++++++++------
 .../Transforms/SLPVectorizer/X86/pr46983.ll   |  46 ++------
 2 files changed, 80 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7694627c3b0430..806e8085038b35 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15164,10 +15164,6 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
   BoUpSLP::ValueSet VectorizedStores;
   bool Changed = false;

-  // Stores the pair of stores (first_store, last_store) in a range, that were
-  // already tried to be vectorized. Allows to skip the store ranges that were
-  // already tried to be vectorized but the attempts were unsuccessful.
-  DenseSet<std::pair<Value *, Value *>> TriedSequences;
   struct StoreDistCompare {
     bool operator()(const std::pair<unsigned, int> &Op1,
                     const std::pair<unsigned, int> &Op2) const {
@@ -15209,8 +15205,10 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
       Type *ValueTy = StoreTy;
       if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
         ValueTy = Trunc->getSrcTy();
-      unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
-          R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
+      unsigned MinVF = std::max<unsigned>(
+          2, PowerOf2Ceil(TTI->getStoreMinimumVF(
+                 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
+                 ValueTy)));

       if (MaxVF < MinVF) {
         LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
@@ -15236,40 +15234,74 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
           VF = Size > MaxVF ? NonPowerOf2VF : Size;
           Size *= 2;
         });
-      unsigned StartIdx = 0;
-      for (unsigned Size : CandidateVFs) {
-        for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
-          ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
-          assert(
-              all_of(
-                  Slice,
-                  [&](Value *V) {
-                    return cast<StoreInst>(V)->getValueOperand()->getType() ==
-                           cast<StoreInst>(Slice.front())
-                               ->getValueOperand()
-                               ->getType();
-                  }) &&
-              "Expected all operands of same type.");
-          if (!VectorizedStores.count(Slice.front()) &&
-              !VectorizedStores.count(Slice.back()) &&
-              TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
-                  .second &&
-              vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
-            // Mark the vectorized stores so that we don't vectorize them again.
-            VectorizedStores.insert(Slice.begin(), Slice.end());
-            Changed = true;
-            // If we vectorized initial block, no need to try to vectorize it
-            // again.
-            if (Cnt == StartIdx)
-              StartIdx += Size;
-            Cnt += Size;
-            continue;
+      unsigned End = Operands.size();
+      unsigned Repeat = 0;
+      constexpr unsigned MaxAttempts = 2;
+      SmallBitVector Range(Operands.size());
+      while (true) {
+        ++Repeat;
+        for (unsigned Size : CandidateVFs) {
+          int StartIdx = Range.find_first_unset();
+          while (StartIdx != -1) {
+            int EndIdx = Range.find_next(StartIdx);
+            unsigned Sz = EndIdx == -1 ? End : EndIdx;
+            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
+              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+              assert(all_of(Slice,
+                            [&](Value *V) {
+                              return cast<StoreInst>(V)
+                                         ->getValueOperand()
+                                         ->getType() ==
+                                     cast<StoreInst>(Slice.front())
+                                         ->getValueOperand()
+                                         ->getType();
+                            }) &&
+                     "Expected all operands of same type.");
+              if (vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
+                // Mark the vectorized stores so that we don't vectorize them
+                // again.
+                VectorizedStores.insert(Slice.begin(), Slice.end());
+                Changed = true;
+                // If we vectorized initial block, no need to try to vectorize
+                // it again.
+                Range.set(Cnt, Cnt + Size);
+                if (Cnt < StartIdx + MinVF)
+                  Range.set(StartIdx, Cnt);
+                if (Cnt > EndIdx - Size - MinVF) {
+                  Range.set(Cnt + Size, EndIdx);
+                  End = Cnt;
+                }
+                Cnt += Size;
+                continue;
+              }
+              ++Cnt;
+            }
+            if (Sz >= End)
+              break;
+            StartIdx = Range.find_next_unset(EndIdx);
           }
-          ++Cnt;
         }
-        // Check if the whole array was vectorized already - exit.
-        if (StartIdx >= Operands.size())
+      // All values vectorized - exit.
+      if (Range.all())
+        break;
+      // Check if tried all attempts or no need for the last attempts at all.
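+      // (Round 1 walks the store-size based VFs; if cost modeling rejected
+      // them all, round 2 retries once with the larger VFs built below.)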
+      if (Repeat >= MaxAttempts)
+        break;
+      constexpr unsigned MaxVFScale = 4;
+      constexpr unsigned StoresLimit = 16;
+      const unsigned MaxTotalNum = std::min(
+          std::max(StoresLimit, MaxVFScale * MaxVF),
+          bit_floor(static_cast<unsigned>(Range.find_last_unset() -
+                                          Range.find_first_unset() + 1)));
+      if (MaxVF >= MaxTotalNum)
         break;
+      // Last attempt to vectorize max number of elements, if all previous
+      // attempts were unsuccessful because of the cost issues.
+      CandidateVFs.clear();
+      for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2)
+        CandidateVFs.push_back(Size);
+      }
     }
   };
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index 75505f632a43f3..3deab0975ce764 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -100,41 +100,17 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
 define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
 ; SSE-LABEL: @store_i64(
 ; SSE-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; SSE-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; SSE-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
-; SSE-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
-; SSE-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
-; SSE-NEXT:    [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
-; SSE-NEXT:    [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
-; SSE-NEXT:    store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
-; SSE-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
-; SSE-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
-; SSE-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
-; SSE-NEXT:    [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
-; SSE-NEXT:    [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
-; SSE-NEXT:    [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
-; SSE-NEXT:    store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
-; SSE-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
-; SSE-NEXT:    [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
-; SSE-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
-; SSE-NEXT:    [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
-; SSE-NEXT:    [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
-; SSE-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
-; SSE-NEXT:    store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
-; SSE-NEXT:    [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
-; SSE-NEXT:    [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
-; SSE-NEXT:    [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
-; SSE-NEXT:    [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
-; SSE-NEXT:    [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
-; SSE-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
-; SSE-NEXT:    store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
+; SSE-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; SSE-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; SSE-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
+; SSE-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; SSE-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
+; SSE-NEXT:    [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; SSE-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
+; SSE-NEXT:    [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64>
+; SSE-NEXT:    store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @store_i64(

From eefee382186005d3662958e076c8e61e286ea1ab Mon Sep 17 00:00:00 2001
From: Nick Desaulniers
Date: Wed, 17 Apr 2024 10:34:23 -0700
Subject: [PATCH 282/300] [libc] set cmake dependencies for condattr test
 (#89103)

The entrypoints are not yet exposed on non-x86. Express this dependency
to unbreak post submit.

Fixes #88987
---
 libc/src/pthread/pthread_condattr_destroy.cpp | 3 ++-
 libc/test/src/pthread/CMakeLists.txt          | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/libc/src/pthread/pthread_condattr_destroy.cpp b/libc/src/pthread/pthread_condattr_destroy.cpp
index 45cc011a4a92d8..41994c6941ffe3 100644
--- a/libc/src/pthread/pthread_condattr_destroy.cpp
+++ b/libc/src/pthread/pthread_condattr_destroy.cpp
@@ -14,7 +14,8 @@

 namespace LIBC_NAMESPACE {

-LLVM_LIBC_FUNCTION(int, pthread_condattr_destroy, (pthread_condattr_t * attr)) {
+LLVM_LIBC_FUNCTION(int, pthread_condattr_destroy,
+                   (pthread_condattr_t * attr [[gnu::unused]])) {
   // Initializing a pthread_condattr_t acquires no resources, so this is a
   // no-op.
   return 0;
diff --git a/libc/test/src/pthread/CMakeLists.txt b/libc/test/src/pthread/CMakeLists.txt
index 46f38422a53745..51954a5babd2c5 100644
--- a/libc/test/src/pthread/CMakeLists.txt
+++ b/libc/test/src/pthread/CMakeLists.txt
@@ -50,4 +50,10 @@ add_libc_unittest(
     libc.include.errno
     libc.include.pthread
     libc.include.time
+    libc.src.pthread.pthread_condattr_destroy
+    libc.src.pthread.pthread_condattr_getclock
+    libc.src.pthread.pthread_condattr_getpshared
+    libc.src.pthread.pthread_condattr_init
+    libc.src.pthread.pthread_condattr_setclock
+    libc.src.pthread.pthread_condattr_setpshared
   )

From 825536039d667eeb933c590fe40c358fdea03a8d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 17 Apr 2024 16:38:31 +0100
Subject: [PATCH 283/300] [CostModel][X86] Add basic GFNI target test coverage
 for shift/rotate costs

---
 .../Analysis/CostModel/X86/fshl-codesize.ll   | 181 +++++++++++++
 .../Analysis/CostModel/X86/fshl-latency.ll    | 181 +++++++++++++
 .../CostModel/X86/fshl-sizelatency.ll         | 241 ++++++++++++++++++
 llvm/test/Analysis/CostModel/X86/fshl.ll      | 181 +++++++++++++
 .../Analysis/CostModel/X86/fshr-codesize.ll   | 181 +++++++++++++
 .../Analysis/CostModel/X86/fshr-latency.ll    | 181 +++++++++++++
 .../CostModel/X86/fshr-sizelatency.ll         | 241 ++++++++++++++++++
 llvm/test/Analysis/CostModel/X86/fshr.ll      | 181 +++++++++++++
 .../CostModel/X86/vshift-ashr-codesize.ll     |  85 ++++++
 .../X86/vshift-ashr-cost-inseltpoison.ll      |  87 +++++++
 .../CostModel/X86/vshift-ashr-cost.ll         |  87 +++++++
 .../CostModel/X86/vshift-ashr-latency.ll      |  93 +++++++
 .../CostModel/X86/vshift-ashr-sizelatency.ll  |  93 +++++++
 .../CostModel/X86/vshift-lshr-codesize.ll     |  65 +++++
 .../X86/vshift-lshr-cost-inseltpoison.ll      |  77 ++++++
 .../CostModel/X86/vshift-lshr-cost.ll         |  77 ++++++
 .../CostModel/X86/vshift-lshr-latency.ll      |  75 ++++++
 .../CostModel/X86/vshift-lshr-sizelatency.ll  |  77 ++++++
.../CostModel/X86/vshift-shl-codesize.ll | 57 +++++ .../X86/vshift-shl-cost-inseltpoison.ll | 81 ++++++ .../Analysis/CostModel/X86/vshift-shl-cost.ll | 81 ++++++ .../CostModel/X86/vshift-shl-latency.ll | 61 +++++ .../CostModel/X86/vshift-shl-sizelatency.ll | 73 ++++++ 23 files changed, 2737 insertions(+) diff --git a/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll b/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll index c212ecfc3eb904..c46e32ffb4ad3a 100644 --- a/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll @@ -12,6 +12,7 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 
@llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call 
i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -572,6 +610,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -681,6 +728,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -790,6 +846,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x 
i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -881,6 +946,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -966,6 +1038,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1051,6 +1130,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1136,6 +1222,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1225,6 +1318,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -1310,6 +1410,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1395,6 +1502,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1480,6 +1594,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1690,6 +1811,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1775,6 +1903,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x 
i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
@@ -2023,6 +2158,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
 %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -2123,6 +2267,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
 %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -2335,6 +2488,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
 %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
@@ -2420,6 +2580,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
@@ -2616,6 +2783,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
 ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
 %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
@@ -2694,6 +2868,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
diff --git a/llvm/test/Analysis/CostModel/X86/fshl-latency.ll b/llvm/test/Analysis/CostModel/X86/fshl-latency.ll
index 487adfe79d9442..fa32497c63ec7e 100644
--- a/llvm/test/Analysis/CostModel/X86/fshl-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshl-latency.ll
@@ -12,6 +12,7 @@
 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM
 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP
 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
 %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
@@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
 %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
@@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
 %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
@@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %
 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
@@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
 %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -563,6 +601,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
 %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -663,6 +710,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
 %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -763,6 +819,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
 %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -854,6 +919,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
 %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
@@ -939,6 +1011,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
 %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
@@ -1024,6 +1103,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
 %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
@@ -1109,6 +1195,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
@@ -1198,6 +1291,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
 %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
@@ -1276,6 +1376,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
 %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
@@ -1354,6 +1461,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
 %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
@@ -1432,6 +1546,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
@@ -1642,6 +1763,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16)
 %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
@@ -1727,6 +1855,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
@@ -1975,6 +2110,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
 %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -2075,6 +2219,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
 %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -2287,6 +2440,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
 %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
@@ -2372,6 +2532,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
@@ -2568,6 +2735,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
 %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
@@ -2646,6 +2820,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
diff --git a/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll
index 52ad7e13c84c69..832a574a9b332a 100644
--- a/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll
@@ -12,6 +12,7 @@
 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM
 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP
 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512GFNI
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
 %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
@@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32)
 %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
@@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
 ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16)
 %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
@@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %
 ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
@@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
 %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -572,6 +610,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
 ; XOP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
 %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -681,6 +728,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
 %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -790,6 +846,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
 %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -881,6 +946,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ; XOP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
 %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
@@ -966,6 +1038,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
 ; XOP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
 %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
@@ -1051,6 +1130,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
 %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
@@ -1136,6 +1222,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
@@ -1225,6 +1318,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
 ; XOP-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
 %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
@@ -1310,6 +1410,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
 ; XOP-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
 %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
@@ -1395,6 +1502,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
 ; XOP-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
 %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
@@ -1480,6 +1594,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ; XOP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
 %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
@@ -1562,6 +1683,13 @@ define void @var_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_rotate_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64)
 %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
@@ -1647,6 +1775,13 @@ define void @var_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32>
%a256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) @@ -1732,6 +1867,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1817,6 +1959,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: 
Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -1919,6 +2068,15 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -2019,6 +2177,15 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2119,6 +2286,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -2219,6 +2395,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call 
<16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2303,6 +2488,13 @@ define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) @@ -2388,6 +2580,13 @@ define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) @@ -2473,6 +2672,13 @@ define void 
@constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2558,6 +2764,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2640,6 +2853,13 @@ define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) @@ -2718,6 +2938,13 @@ define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) @@ -2796,6 +3023,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2874,6 +3108,13 @@ define void 
@splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) diff --git a/llvm/test/Analysis/CostModel/X86/fshl.ll b/llvm/test/Analysis/CostModel/X86/fshl.ll index 4e688c29c7ea33..311d8d5ed7d2a9 100644 --- a/llvm/test/Analysis/CostModel/X86/fshl.ll +++ b/llvm/test/Analysis/CostModel/X86/fshl.ll @@ -12,6 +12,7 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> 
%a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> 
@llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -563,6 +601,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x 
i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -663,6 +710,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -763,6 +819,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -854,6 +919,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -932,6 +1004,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> 
@llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1017,6 +1096,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1102,6 +1188,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1191,6 +1284,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x 
i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -1269,6 +1369,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1347,6 +1454,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> 
@llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1425,6 +1539,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1635,6 +1756,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1720,6 +1848,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x 
i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -1968,6 +2103,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -2068,6 +2212,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; 
AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2273,6 +2426,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2358,6 +2518,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) 
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2554,6 +2721,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2632,6 +2806,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) diff --git a/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll b/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll index 0e76246fad85f0..f9d30e4ced3ec9 100644 --- 
a/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll @@ -12,6 +12,7 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call 
<16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = 
call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -572,6 +610,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -681,6 +728,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: 
Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -790,6 +846,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -881,6 +946,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x 
i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -966,6 +1038,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1051,6 +1130,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = 
call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1136,6 +1222,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1225,6 +1318,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -1310,6 +1410,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x 
i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1395,6 +1502,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1480,6 +1594,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> 
@llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1690,6 +1811,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1775,6 +1903,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -2023,6 +2158,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> 
@llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -2123,6 +2267,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2335,6 +2488,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2420,6 +2580,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2616,6 +2783,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, 
<8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2694,6 +2868,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) diff --git a/llvm/test/Analysis/CostModel/X86/fshr-latency.ll b/llvm/test/Analysis/CostModel/X86/fshr-latency.ll index 73e32dc92a69b7..ed2227591847a6 100644 --- a/llvm/test/Analysis/CostModel/X86/fshr-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/fshr-latency.ll @@ -12,6 +12,7 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) 
; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 
x i64> undef, <4 x i32> zeroinitializer @@ -563,6 +601,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -663,6 +710,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x 
i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -763,6 +819,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -854,6 +919,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -939,6 +1011,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x 
i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1024,6 +1103,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1109,6 +1195,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 57 for instruction: 
%V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1198,6 +1291,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -1276,6 +1376,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1354,6 +1461,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> 
%a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1432,6 +1546,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1642,6 +1763,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1727,6 +1855,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -1975,6 +2110,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -2075,6 +2219,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2287,6 +2440,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2372,6 +2532,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x 
i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2568,6 +2735,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2646,6 +2820,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> 
%a128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
diff --git a/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll
index 2d106c6dd30691..8931781f70bdce 100644
--- a/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll
@@ -12,6 +12,7 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512GFNI
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
 %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
@@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16
x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> 
%b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -572,6 +610,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> 
@llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -681,6 +728,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -790,6 +846,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost 
Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -881,6 +946,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -966,6 +1038,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1051,6 +1130,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 
29 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1136,6 +1222,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1225,6 +1318,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -1310,6 +1410,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1395,6 +1502,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1480,6 +1594,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost 
of 29 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1562,6 +1683,13 @@ define void @var_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) @@ -1647,6 +1775,13 @@ define void @var_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) @@ -1732,6 +1867,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1817,6 +1959,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -1919,6 +2068,15 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, 
<4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -2019,6 +2177,15 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2119,6 +2286,15 @@ define 
void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -2219,6 +2395,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> 
%c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2303,6 +2488,13 @@ define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) @@ -2388,6 +2580,13 @@ define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) @@ -2473,6 +2672,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, 
i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2558,6 +2764,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2640,6 +2853,13 @@ define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 
x i64> ) @@ -2718,6 +2938,13 @@ define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) @@ -2796,6 +3023,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2874,6 +3108,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) diff --git a/llvm/test/Analysis/CostModel/X86/fshr.ll b/llvm/test/Analysis/CostModel/X86/fshr.ll index 9565630677d52c..ca9ddcc52938d7 100644 --- a/llvm/test/Analysis/CostModel/X86/fshr.ll +++ b/llvm/test/Analysis/CostModel/X86/fshr.ll @@ -12,6 +12,7 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> 
@llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, 
<32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -563,6 +601,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x 
i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -663,6 +710,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -763,6 +819,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an 
estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -854,6 +919,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -932,6 +1004,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1017,6 +1096,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for 
instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1102,6 +1188,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1191,6 +1284,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -1269,6 +1369,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1347,6 +1454,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1425,6 +1539,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 20 
for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1635,6 +1756,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1720,6 +1848,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -1968,6 +2103,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -2068,6 +2212,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> 
@llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2273,6 +2426,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2358,6 +2518,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2554,6 +2721,13 @@ define void @splatconstant_rotate_i16(i16 
%a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2632,6 +2806,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll index 8e3f38c796e6e5..a3c24bdd1a8820 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s 
--check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions. @@ -232,6 +233,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, %b ret <8 x i16> %shift @@ -265,6 +270,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift @@ -298,6 +307,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, %b ret <32 x i16> %shift @@ -327,6 +340,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, %b ret <16 x i8> %shift @@ -389,6 +406,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift @@ -778,6 +799,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BW-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -827,6 +854,12 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %insert = insertelement <16 x i8> undef, i8 %b, i32 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer @@ -882,6 +915,12 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <32 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <32 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer @@ -937,6 +976,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x 
i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1158,6 +1203,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, ret <8 x i16> %shift @@ -1191,6 +1240,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1224,6 +1277,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1253,6 +1310,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1315,6 +1376,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 
'constant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift @@ -1531,6 +1596,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1568,6 +1637,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1601,6 +1674,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1634,6 +1711,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1667,6 +1748,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll index d4ece3b2a1134e..67679f2cb85666 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector arithmetic shift right instructions. @@ -228,6 +229,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v8i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, %b ret <8 x i16> %shift @@ -269,6 +274,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift @@ -310,6 +319,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, %b ret <32 x i16> %shift @@ -351,6 +364,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %shift = ashr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, %b ret <16 x i8> %shift @@ -392,6 +409,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift @@ -433,6 +454,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift @@ -834,6 +859,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> poison, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> poison, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer @@ -950,6 +981,12 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> poison, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, 
%splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %insert = insertelement <32 x i8> poison, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer @@ -1017,6 +1054,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <64 x i8> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> poison, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> poison, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer @@ -1226,6 +1269,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v8i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, ret <8 x i16> %shift @@ -1267,6 +1314,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1308,6 +1359,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1349,6 +1404,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, ; 
AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1390,6 +1449,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1431,6 +1494,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift @@ -1631,6 +1698,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1672,6 +1743,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1713,6 +1788,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; 
%shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1754,6 +1833,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1795,6 +1878,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll index f8fb2d76a778fc..efd378ee0b5a61 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector arithmetic shift right instructions. 
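For readers unfamiliar with these files: the check blocks are machine-generated (upstream they are kept in sync with utils/update_analyze_test_checks.py). A minimal hand-written sketch of the pattern they follow — not part of the patch, and assuming the cost-model printer's new-pass-manager spelling, -passes="print<cost-model>" — would be:

; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" \
; RUN:   2>&1 -disable-output -mcpu=tigerlake | FileCheck %s
define <16 x i8> @sample_ashr(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: 'sample_ashr'
; CHECK: Cost Model: Found an estimated cost of {{[0-9]+}} for instruction: %shift = ashr <16 x i8> %a, %b
  %shift = ashr <16 x i8> %a, %b
  ret <16 x i8> %shift
}

The regex keeps the sketch target-agnostic; the generated prefixes in the hunks below instead pin the exact per-CPU cost, which is what lets a costing change surface as a test diff.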
@@ -228,6 +229,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v8i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, %b ret <8 x i16> %shift @@ -269,6 +274,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift @@ -310,6 +319,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, %b ret <32 x i16> %shift @@ -351,6 +364,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, %b ret <16 x i8> %shift @@ -392,6 +409,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift @@ -433,6 +454,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 
'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift @@ -834,6 +859,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -950,6 +981,12 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer @@ -1017,6 +1054,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <64 x i8> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1226,6 +1269,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v8i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, ret <8 x i16> %shift @@ -1267,6 +1314,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1308,6 +1359,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1349,6 +1404,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1390,6 +1449,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1431,6 +1494,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 15 for 
instruction: %shift = ashr <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift @@ -1631,6 +1698,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1672,6 +1743,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1713,6 +1788,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1754,6 +1833,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1795,6 +1878,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
;
%shift = ashr <64 x i8> %a,
ret <64 x i8> %shift
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll
index af5278e355cfb8..cd4189d4a7f84d 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll
@@ -15,6 +15,7 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SSE42
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI
; Verify the cost of vector arithmetic shift right instructions.
@@ -232,6 +233,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
; AVX512BW-LABEL: 'var_shift_v8i16'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
;
%shift = ashr <8 x i16> %a, %b
ret <8 x i16> %shift
@@ -265,6 +270,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX512BW-LABEL: 'var_shift_v16i16'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
;
%shift = ashr <16 x i16> %a, %b
ret <16 x i16> %shift
@@ -298,6 +307,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: 'var_shift_v32i16'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
;
%shift = ashr <32 x i16> %a, %b
ret <32 x i16> %shift
@@ -331,6 +344,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
; AVX512BW-LABEL: 'var_shift_v16i8'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, %b
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost
Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, %b ret <16 x i8> %shift @@ -364,6 +381,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift @@ -397,6 +418,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift @@ -834,6 +859,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i16> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -889,6 +920,12 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> 
%shift ; %insert = insertelement <16 x i8> undef, i8 %b, i32 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer @@ -944,6 +981,12 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <32 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <32 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer @@ -999,6 +1042,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1220,6 +1269,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, ret <8 x i16> %shift @@ -1253,6 +1306,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1286,6 +1343,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1319,6 +1380,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1352,6 +1417,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1385,6 +1454,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift @@ -1649,6 +1722,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1686,6 +1763,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
;
%shift = ashr <32 x i16> %a,
ret <32 x i16> %shift
@@ -1723,6 +1804,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; AVX512BW-LABEL: 'splatconstant_shift_v16i8'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a,
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
;
%shift = ashr <16 x i8> %a,
ret <16 x i8> %shift
@@ -1760,6 +1845,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
; AVX512BW-LABEL: 'splatconstant_shift_v32i8'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %shift = ashr <32 x i8> %a,
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %shift = ashr <32 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
;
%shift = ashr <32 x i8> %a,
ret <32 x i8> %shift
@@ -1797,6 +1886,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
; AVX512BW-LABEL: 'splatconstant_shift_v64i8'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %shift = ashr <64 x i8> %a,
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %shift = ashr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
;
%shift = ashr <64 x i8> %a,
ret <64 x i8> %shift
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll
index 8286b3967af606..84ccad0294155f 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll
@@ -15,6 +15,7 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE42
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE42
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI
; Verify the cost of vector arithmetic shift right instructions.
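One note on the four files touched here (not from the patch itself): they cost the same IR under different cost kinds. The first two use the printer's default, reciprocal throughput, while this file and the latency file pass -cost-kind explicitly, which is why an identical instruction carries different numbers across files — the <16 x i8> variable ashr, for example, is costed 4 in the throughput files, 8 under latency, and 5 under size-latency in the hunk below. A sketch of the distinction, with the costs taken from the AVX512GFNI check lines in this patch and the pass spelling assumed as print<cost-model>:

; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" \
; RUN:   2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefix=TPUT
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" \
; RUN:   2>&1 -disable-output -cost-kind=latency -mcpu=tigerlake | FileCheck %s --check-prefix=LAT
define <16 x i8> @kind_demo(<16 x i8> %a, <16 x i8> %b) {
; TPUT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b
; LAT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, %b
  %shift = ashr <16 x i8> %a, %b
  ret <16 x i8> %shift
}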
@@ -232,6 +233,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, %b ret <8 x i16> %shift @@ -265,6 +270,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift @@ -298,6 +307,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, %b ret <32 x i16> %shift @@ -331,6 +344,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, %b ret <16 x i8> %shift @@ -364,6 +381,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <32 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift @@ -397,6 +418,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost 
Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift @@ -786,6 +811,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = ashr <32 x i16> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = ashr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -835,6 +866,12 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %insert = insertelement <16 x i8> undef, i8 %b, i32 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer @@ -890,6 +927,12 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> 
%shift ; %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer @@ -945,6 +988,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1166,6 +1215,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, ret <8 x i16> %shift @@ -1199,6 +1252,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1232,6 +1289,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1265,6 +1326,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1298,6 +1363,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1331,6 +1400,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift @@ -1547,6 +1620,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1584,6 +1661,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1617,6 +1698,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1654,6 +1739,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 
'splatconstant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1691,6 +1780,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll index 7c506af62e8178..a0e15bb8ff738e 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions. 
@@ -240,6 +241,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -273,6 +278,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -306,6 +315,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift @@ -335,6 +348,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -397,6 +414,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = lshr <64 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = lshr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -762,6 +783,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat ; AVX512BW-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -909,6 +936,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1138,6 +1171,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, ret <8 x i16> %shift @@ -1171,6 +1208,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1204,6 +1245,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1233,6 +1278,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, ret <16 x i8> %shift @@ -1295,6 +1344,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift @@ -1495,6 +1548,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1532,6 +1589,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1631,6 +1692,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll 
b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll index 855d2b8e88e3a5..34ea1a644f6cca 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions. @@ -232,6 +233,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v8i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -273,6 +278,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -314,6 +323,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift @@ -355,6 +368,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -396,6 +413,10 @@ define <32 x i8> @var_shift_v32i8(<32 x 
i8> %a, <32 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift @@ -437,6 +458,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -826,6 +851,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> poison, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> poison, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer @@ -991,6 +1022,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <64 x i8> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> poison, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> poison, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer @@ -1200,6 
+1237,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v8i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, ret <8 x i16> %shift @@ -1241,6 +1282,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1282,6 +1327,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1323,6 +1372,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, ret <16 x i8> %shift @@ -1364,6 +1417,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -1405,6 +1462,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found 
an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift @@ -1593,6 +1654,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1634,6 +1699,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1700,6 +1769,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -1741,6 +1814,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll index 55894dc9329578..93b844eddf1824 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; 
RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions. @@ -232,6 +233,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v8i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -273,6 +278,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -314,6 +323,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift @@ -355,6 +368,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -396,6 +413,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift @@ -437,6 +458,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v64i8' ; AVX512BWVL-NEXT: Cost 
Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -826,6 +851,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -991,6 +1022,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <64 x i8> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1200,6 +1237,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v8i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, ret <8 x i16> %shift @@ -1241,6 +1282,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 
'constant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1282,6 +1327,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1323,6 +1372,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, ret <16 x i8> %shift @@ -1364,6 +1417,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -1405,6 +1462,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift @@ -1593,6 +1654,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> 
%a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1634,6 +1699,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1700,6 +1769,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -1741,6 +1814,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll index 92fe253d38e738..61620e2cc97ed8 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions. 
@@ -240,6 +241,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -277,6 +282,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -314,6 +323,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift @@ -351,6 +364,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -421,6 +438,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %shift = lshr <64 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %shift = lshr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -834,6 +855,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i16> %a, %splat ; AVX512BW-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -889,6 +916,12 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = lshr <16 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = lshr <16 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %insert = insertelement <16 x i8> undef, i8 %b, i32 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer @@ -993,6 +1026,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1222,6 +1261,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, ret <8 x i16> %shift @@ -1259,6 +1302,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1296,6 +1343,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1333,6 +1384,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, ret <16 x i8> %shift @@ -1403,6 +1458,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift @@ -1643,6 +1702,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1676,6 +1739,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1709,6 +1776,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, ret <16 x i8> %shift @@ -1771,6 +1842,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll index fe8a7d91cc3752..e6b6ac75b65d58 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions.
@@ -240,6 +241,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -273,6 +278,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -306,6 +315,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift @@ -335,6 +348,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = lshr <16 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = lshr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -368,6 +385,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <32 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift @@ -401,6 +422,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <64 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost 
Model: Found an estimated cost of 18 for instruction: %shift = lshr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -766,6 +791,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <32 x i16> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -913,6 +944,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = lshr <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = lshr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1142,6 +1179,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, ret <8 x i16> %shift @@ -1175,6 +1216,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1208,6 +1253,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1237,6 +1286,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = lshr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = lshr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, ret <16 x i8> %shift @@ -1270,6 +1323,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -1303,6 +1360,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift @@ -1503,6 +1564,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1540,6 +1605,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1606,6 +1675,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -1643,6 +1716,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll index 07229e22873c43..265658b1e3a2da 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector shift left instructions.
@@ -240,6 +241,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = shl <8 x i16> %a, %b ret <8 x i16> %shift @@ -273,6 +278,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift @@ -306,6 +315,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, %b ret <32 x i16> %shift @@ -335,6 +348,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, %b ret <16 x i8> %shift @@ -397,6 +414,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = shl <64 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = shl <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -762,6 +783,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i16> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -915,6 +942,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1186,6 +1219,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1215,6 +1252,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -1277,6 +1318,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = shl <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an 
estimated cost of 13 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift @@ -1477,6 +1522,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1514,6 +1563,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1613,6 +1666,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll index d8737d2365649e..f093d1c8ca358a 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SLM ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector shift left instructions.
@@ -236,6 +237,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; SLM-LABEL: 'var_shift_v8i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %shift = shl <8 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = shl <8 x i16> %a, %b ret <8 x i16> %shift @@ -281,6 +286,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SLM-LABEL: 'var_shift_v16i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %shift = shl <16 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift @@ -326,6 +335,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; SLM-LABEL: 'var_shift_v32i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %shift = shl <32 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, %b ret <32 x i16> %shift @@ -371,6 +384,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; SLM-LABEL: 'var_shift_v16i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <16 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, %b ret <16 x i8> %shift @@ -416,6 +433,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SLM-LABEL: 'var_shift_v32i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = shl <32 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift @@ -461,6 +482,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; SLM-LABEL: 'var_shift_v64i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %shift = shl <64 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, %b +; AVX512GFNI-NEXT: 
Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -868,6 +893,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i16> %a, %splat ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> poison, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> poison, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer @@ -1051,6 +1082,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %shift = shl <64 x i8> %a, %splat ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> poison, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> poison, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer @@ -1305,6 +1342,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; SLM-LABEL: 'constant_shift_v16i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1354,6 +1395,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; SLM-LABEL: 'constant_shift_v32i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1399,6 +1444,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; SLM-LABEL: 'constant_shift_v16i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <16 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -1444,6 +1493,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SLM-LABEL: 'constant_shift_v32i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = shl <32 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -1489,6 +1542,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; SLM-LABEL: 'constant_shift_v64i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %shift = shl <64 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift @@ -1677,6 +1734,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1718,6 +1779,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1784,6 +1849,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; 
AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -1825,6 +1894,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift @@ -2044,6 +2117,10 @@ define <16 x i16> @test6(<16 x i16> %a) { ; SLM-LABEL: 'test6' ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shl = shl <16 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shl +; +; AVX512GFNI-LABEL: 'test6' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shl = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shl ; %shl = shl <16 x i16> %a, ret <16 x i16> %shl @@ -2162,6 +2239,10 @@ define <32 x i16> @test9(<32 x i16> %a) { ; SLM-LABEL: 'test9' ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shl = shl <32 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shl +; +; AVX512GFNI-LABEL: 'test9' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shl = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shl ; %shl = shl <32 x i16> %a, ret <32 x i16> %shl diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll index b254ff27daccb9..09521cfa2d23d3 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SLM ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector shift left instructions.
@@ -236,6 +237,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; SLM-LABEL: 'var_shift_v8i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %shift = shl <8 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = shl <8 x i16> %a, %b ret <8 x i16> %shift @@ -281,6 +286,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SLM-LABEL: 'var_shift_v16i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %shift = shl <16 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift @@ -326,6 +335,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; SLM-LABEL: 'var_shift_v32i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %shift = shl <32 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, %b ret <32 x i16> %shift @@ -371,6 +384,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; SLM-LABEL: 'var_shift_v16i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <16 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, %b ret <16 x i8> %shift @@ -416,6 +433,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SLM-LABEL: 'var_shift_v32i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = shl <32 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift @@ -461,6 +482,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; SLM-LABEL: 'var_shift_v64i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %shift = shl <64 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, %b +; AVX512GFNI-NEXT: 
Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -868,6 +893,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i16> %a, %splat ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -1051,6 +1082,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %shift = shl <64 x i8> %a, %splat ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1305,6 +1342,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; SLM-LABEL: 'constant_shift_v16i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1354,6 +1395,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; SLM-LABEL: 'constant_shift_v32i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1399,6 +1444,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; SLM-LABEL: 'constant_shift_v16i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <16 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -1444,6 +1493,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SLM-LABEL: 'constant_shift_v32i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = shl <32 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -1489,6 +1542,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; SLM-LABEL: 'constant_shift_v64i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %shift = shl <64 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift @@ -1677,6 +1734,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1718,6 +1779,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1784,6 +1849,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; 
AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -1825,6 +1894,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift @@ -2044,6 +2117,10 @@ define <16 x i16> @test6(<16 x i16> %a) { ; SLM-LABEL: 'test6' ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shl = shl <16 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shl +; +; AVX512GFNI-LABEL: 'test6' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shl = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shl ; %shl = shl <16 x i16> %a, ret <16 x i16> %shl @@ -2162,6 +2239,10 @@ define <32 x i16> @test9(<32 x i16> %a) { ; SLM-LABEL: 'test9' ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shl = shl <32 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shl +; +; AVX512GFNI-LABEL: 'test9' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shl = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shl ; %shl = shl <32 x i16> %a, ret <32 x i16> %shl diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll index c53dbc5f586aa4..42c91144ff6f99 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI ; Verify the cost of vector shift left instructions.
@@ -240,6 +241,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = shl <8 x i16> %a, %b ret <8 x i16> %shift @@ -277,6 +282,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift @@ -314,6 +323,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, %b ret <32 x i16> %shift @@ -351,6 +364,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <16 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, %b ret <16 x i8> %shift @@ -830,6 +847,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i16> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = 
insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -983,6 +1006,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = shl <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = shl <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1208,6 +1237,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = shl <8 x i16> %a, ret <8 x i16> %shift @@ -1241,6 +1274,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1274,6 +1311,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1311,6 +1352,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %shift = shl <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -1617,6 +1662,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1650,6 +1699,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1683,6 +1736,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = shl <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = shl <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -1745,6 +1802,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll index cbf0f24b90fb89..47b24df063ef70 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SLM ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency 
-mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions. @@ -263,6 +264,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %shift = shl <8 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; ; BTVER2-LABEL: 'var_shift_v8i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %shift = shl <8 x i16> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift @@ -303,6 +308,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %shift = shl <16 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; ; BTVER2-LABEL: 'var_shift_v16i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %shift = shl <16 x i16> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift @@ -343,6 +352,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %shift = shl <32 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; ; BTVER2-LABEL: 'var_shift_v32i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %shift = shl <32 x i16> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift @@ -383,6 +396,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %shift = shl <16 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = shl <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; ; BTVER2-LABEL: 'var_shift_v16i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %shift = shl <16 x i8> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift @@ -423,6 +440,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %shift = shl <32 x i8> %a, 
%b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %shift = shl <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; ; BTVER2-LABEL: 'var_shift_v32i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %shift = shl <32 x i8> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift @@ -463,6 +484,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %shift = shl <64 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; ; BTVER2-LABEL: 'var_shift_v64i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %shift = shl <64 x i8> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift @@ -824,6 +849,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i16> %a, %splat ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -995,6 +1026,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %shift = shl <64 x i8> %a, %splat ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = 
shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1238,6 +1275,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1271,6 +1312,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1309,6 +1354,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %shift = shl <16 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = shl <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; ; BTVER2-LABEL: 'constant_shift_v16i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %shift = shl <16 x i8> %a, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift @@ -1349,6 +1398,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %shift = shl <32 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %shift = shl <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; ; BTVER2-LABEL: 'constant_shift_v32i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %shift = shl <32 x i8> %a, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift @@ -1389,6 +1442,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; SLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %shift = shl <64 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; ; BTVER2-LABEL: 'constant_shift_v64i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %shift = shl <64 x i8> %a, ; BTVER2-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: ret <64 x i8> %shift @@ -1571,6 +1628,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1604,6 +1665,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1662,6 +1727,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -1695,6 +1764,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift From da04e4afd3cae13581cac85688fbf10a5848655f Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Wed, 17 Apr 2024 12:05:26 -0500 Subject: [PATCH 284/300] [InstCombine] Use `auto *` instead of `auto` in `visitSIToFP`; NFC --- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index d242d3f443def9..4537a47da2ced7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1990,7 +1990,7 @@ Instruction *InstCombinerImpl::visitSIToFP(CastInst &CI) { if (Instruction *R = commonCastTransforms(CI)) return R; if (isKnownNonNegative(CI.getOperand(0), SQ)) { - auto UI = + auto *UI = CastInst::Create(Instruction::UIToFP, CI.getOperand(0), CI.getType()); UI->setNonNeg(true); return UI; From 
d423d80e560d8bf7ca493596d9f34a9e1f0eede7 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 17 Apr 2024 13:36:53 -0400 Subject: [PATCH 285/300] [libc++][pstl] Promote CPU backends to top-level backends (#88968) This patch removes the two-level backend dispatching mechanism we had in the PSTL. Instead of selecting both a PSTL backend and a PSTL CPU backend, we now only select a top-level PSTL backend. This greatly simplifies the PSTL configuration layer. While this patch technically removes some flexibility from the PSTL configuration mechanism because CPU backends are not considered separately, it opens the door to a much more powerful configuration mechanism based on chained backends in a follow-up patch. This is a step towards overhauling the PSTL dispatching mechanism. --- libcxx/CMakeLists.txt | 18 +++---- libcxx/cmake/caches/Apple.cmake | 2 +- libcxx/include/CMakeLists.txt | 11 ++--- .../__algorithm/pstl_backends/cpu_backend.h | 23 --------- .../pstl_backends/cpu_backends/any_of.h | 2 +- .../pstl_backends/cpu_backends/backend.h | 45 ------------------ .../pstl_backends/cpu_backends/fill.h | 2 +- .../pstl_backends/cpu_backends/find_if.h | 2 +- .../pstl_backends/cpu_backends/for_each.h | 2 +- .../pstl_backends/cpu_backends/merge.h | 2 +- .../pstl_backends/cpu_backends/stable_sort.h | 2 +- .../pstl_backends/cpu_backends/transform.h | 2 +- .../cpu_backends/transform_reduce.h | 2 +- libcxx/include/__algorithm/pstl_copy.h | 2 +- libcxx/include/__algorithm/pstl_count.h | 2 +- libcxx/include/__algorithm/pstl_find.h | 2 +- libcxx/include/__algorithm/pstl_for_each.h | 2 +- libcxx/include/__algorithm/pstl_generate.h | 2 +- .../include/__algorithm/pstl_is_partitioned.h | 2 +- libcxx/include/__algorithm/pstl_merge.h | 2 +- libcxx/include/__algorithm/pstl_move.h | 2 +- libcxx/include/__algorithm/pstl_replace.h | 2 +- libcxx/include/__algorithm/pstl_rotate_copy.h | 2 +- libcxx/include/__algorithm/pstl_sort.h | 2 +- libcxx/include/__algorithm/pstl_stable_sort.h | 2 +- libcxx/include/__algorithm/pstl_transform.h | 2 +- libcxx/include/__config_site.in | 6 +-- .../include/__numeric/pstl_transform_reduce.h | 2 +- .../backends}/libdispatch.h | 19 ++++++-- .../cpu_backends => __pstl/backends}/serial.h | 19 ++++++-- .../thread.h => __pstl/backends/std_thread.h} | 19 ++++++-- libcxx/include/__pstl/configuration.h | 27 +++++++++++ .../configuration_fwd.h} | 25 +++++++--- libcxx/include/libcxx.imp | 6 --- libcxx/include/module.modulemap | 47 ++++++++----------- libcxx/src/CMakeLists.txt | 2 +- libcxx/src/pstl/libdispatch.cpp | 2 +- ...pstl.libdispatch.chunk_partitions.pass.cpp | 4 +- .../apple/system-install-properties.sh.cpp | 2 +- libcxx/utils/libcxx/test/features.py | 2 +- 40 files changed, 154 insertions(+), 169 deletions(-) delete mode 100644 libcxx/include/__algorithm/pstl_backends/cpu_backend.h delete mode 100644 libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h rename libcxx/include/{__algorithm/pstl_backends/cpu_backends => __pstl/backends}/libdispatch.h (94%) rename libcxx/include/{__algorithm/pstl_backends/cpu_backends => __pstl/backends}/serial.h (78%) rename libcxx/include/{__algorithm/pstl_backends/cpu_backends/thread.h => __pstl/backends/std_thread.h} (79%) create mode 100644 libcxx/include/__pstl/configuration.h rename libcxx/include/{__algorithm/pstl_backend.h => __pstl/configuration_fwd.h} (93%) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 043d5a8295c1a6..2977c26646cb2e 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -300,9 +300,9 @@ 
option(LIBCXX_HAS_EXTERNAL_THREAD_API This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF) if (LIBCXX_ENABLE_THREADS) - set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use") + set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use") else() - set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use") + set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use") endif() # Misc options ---------------------------------------------------------------- @@ -792,14 +792,14 @@ elseif (LIBCXX_HARDENING_MODE STREQUAL "debug") config_define(8 _LIBCPP_HARDENING_MODE_DEFAULT) endif() -if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "serial") - config_define(1 _LIBCPP_PSTL_CPU_BACKEND_SERIAL) -elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "std_thread") - config_define(1 _LIBCPP_PSTL_CPU_BACKEND_THREAD) -elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch") - config_define(1 _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH) +if (LIBCXX_PSTL_BACKEND STREQUAL "serial") + config_define(1 _LIBCPP_PSTL_BACKEND_SERIAL) +elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread") + config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD) +elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch") + config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH) else() - message(FATAL_ERROR "LIBCXX_PSTL_CPU_BACKEND is set to ${LIBCXX_PSTL_CPU_BACKEND}, which is not a valid backend. + message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend. Valid backends are: serial, std_thread and libdispatch") endif() diff --git a/libcxx/cmake/caches/Apple.cmake b/libcxx/cmake/caches/Apple.cmake index cec13c08acf107..8768653e620add 100644 --- a/libcxx/cmake/caches/Apple.cmake +++ b/libcxx/cmake/caches/Apple.cmake @@ -7,7 +7,7 @@ set(LIBCXX_ENABLE_STATIC ON CACHE BOOL "") set(LIBCXX_ENABLE_SHARED ON CACHE BOOL "") set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "") set(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS ON CACHE BOOL "") -set(LIBCXX_PSTL_CPU_BACKEND libdispatch CACHE STRING "") +set(LIBCXX_PSTL_BACKEND libdispatch CACHE STRING "") set(LIBCXX_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "") set(LIBCXXABI_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "") diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index a2af1d9915be40..ee4979bfc6f899 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -73,18 +73,12 @@ set(files __algorithm/pop_heap.h __algorithm/prev_permutation.h __algorithm/pstl_any_all_none_of.h - __algorithm/pstl_backend.h - __algorithm/pstl_backends/cpu_backend.h __algorithm/pstl_backends/cpu_backends/any_of.h - __algorithm/pstl_backends/cpu_backends/backend.h __algorithm/pstl_backends/cpu_backends/fill.h __algorithm/pstl_backends/cpu_backends/find_if.h __algorithm/pstl_backends/cpu_backends/for_each.h - __algorithm/pstl_backends/cpu_backends/libdispatch.h __algorithm/pstl_backends/cpu_backends/merge.h - __algorithm/pstl_backends/cpu_backends/serial.h __algorithm/pstl_backends/cpu_backends/stable_sort.h - __algorithm/pstl_backends/cpu_backends/thread.h __algorithm/pstl_backends/cpu_backends/transform.h __algorithm/pstl_backends/cpu_backends/transform_reduce.h __algorithm/pstl_copy.h @@ -594,6 +588,11 @@ set(files __numeric/transform_exclusive_scan.h __numeric/transform_inclusive_scan.h __numeric/transform_reduce.h + __pstl/backends/libdispatch.h + __pstl/backends/serial.h + __pstl/backends/std_thread.h + __pstl/configuration.h + __pstl/configuration_fwd.h __pstl/cpu_algos/cpu_traits.h 
__random/bernoulli_distribution.h __random/binomial_distribution.h diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h deleted file mode 100644 index 53eae58f960952..00000000000000 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h +++ /dev/null @@ -1,23 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H -#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H - -#include <__algorithm/pstl_backends/cpu_backends/any_of.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> -#include <__algorithm/pstl_backends/cpu_backends/fill.h> -#include <__algorithm/pstl_backends/cpu_backends/find_if.h> -#include <__algorithm/pstl_backends/cpu_backends/for_each.h> -#include <__algorithm/pstl_backends/cpu_backends/merge.h> -#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h> -#include <__algorithm/pstl_backends/cpu_backends/transform.h> -#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h> -#include <__config> - -#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h index 3755d288047e0b..3db4765da64b2e 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h @@ -11,12 +11,12 @@ #include <__algorithm/any_of.h> #include <__algorithm/find_if.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__atomic/atomic.h> #include <__atomic/memory_order.h> #include <__config> #include <__functional/operations.h> #include <__iterator/concepts.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h deleted file mode 100644 index cb9425862a2b03..00000000000000 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h +++ /dev/null @@ -1,45 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H -#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H - -#include <__config> -#include - -#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) -# include <__algorithm/pstl_backends/cpu_backends/serial.h> -#elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) -# include <__algorithm/pstl_backends/cpu_backends/thread.h> -#elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH) -# include <__algorithm/pstl_backends/cpu_backends/libdispatch.h> -#else -# error "Invalid CPU backend choice" -#endif - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 - -_LIBCPP_BEGIN_NAMESPACE_STD - -# if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) -using __cpu_backend_tag = __pstl::__serial_backend_tag; -# elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) -using __cpu_backend_tag = __pstl::__std_thread_backend_tag; -# elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH) -using __cpu_backend_tag = __pstl::__libdispatch_backend_tag; -# endif - -_LIBCPP_END_NAMESPACE_STD - -#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17 - -#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h index 0c20bdff62675a..b5a49f8417d322 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h @@ -10,9 +10,9 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_FILL_H #include <__algorithm/fill.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/empty.h> diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h index 626293faef6921..2b1754ea3a7551 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h @@ -10,12 +10,12 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_FIND_IF_H #include <__algorithm/find_if.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__atomic/atomic.h> #include <__config> #include <__functional/operations.h> #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h index d637084e151d81..6db212ead8ae60 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h @@ -10,9 +10,9 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKNEDS_FOR_EACH_H #include <__algorithm/for_each.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> +#include <__pstl/configuration_fwd.h> #include 
<__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/empty.h> diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h index c93f4051c9d094..f3e59e8c028541 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h @@ -10,9 +10,9 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_MERGE_H #include <__algorithm/merge.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h index 8c60cf897ff860..9ad8cc8fb0f2da 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_STABLE_SORT_H #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_STABLE_SORT_H -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__algorithm/stable_sort.h> #include <__config> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/empty.h> diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h index 4b9b2968668327..65e166d847e12c 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_H #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_H -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__algorithm/transform.h> #include <__config> #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h index c074eea9861c1b..af481d505bb911 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> #include <__numeric/transform_reduce.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/desugars_to.h> #include <__type_traits/is_arithmetic.h> diff --git a/libcxx/include/__algorithm/pstl_copy.h b/libcxx/include/__algorithm/pstl_copy.h index f35bb9713ef140..0fcea33c3919f0 100644 --- a/libcxx/include/__algorithm/pstl_copy.h +++ b/libcxx/include/__algorithm/pstl_copy.h @@ -10,13 +10,13 @@ 
#define _LIBCPP___ALGORITHM_PSTL_COPY_H #include <__algorithm/copy_n.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__algorithm/pstl_transform.h> #include <__config> #include <__functional/identity.h> #include <__iterator/concepts.h> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_execution_policy.h> diff --git a/libcxx/include/__algorithm/pstl_count.h b/libcxx/include/__algorithm/pstl_count.h index 6ff57cac334eb0..64c84d855e4f61 100644 --- a/libcxx/include/__algorithm/pstl_count.h +++ b/libcxx/include/__algorithm/pstl_count.h @@ -11,7 +11,6 @@ #include <__algorithm/count.h> #include <__algorithm/for_each.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_for_each.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__atomic/atomic.h> @@ -20,6 +19,7 @@ #include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> #include <__numeric/pstl_transform_reduce.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> diff --git a/libcxx/include/__algorithm/pstl_find.h b/libcxx/include/__algorithm/pstl_find.h index 3b30a7bc9b456f..b4c4dfb2ffb6f6 100644 --- a/libcxx/include/__algorithm/pstl_find.h +++ b/libcxx/include/__algorithm/pstl_find.h @@ -11,10 +11,10 @@ #include <__algorithm/comp.h> #include <__algorithm/find.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> diff --git a/libcxx/include/__algorithm/pstl_for_each.h b/libcxx/include/__algorithm/pstl_for_each.h index a9ebed74a62fd4..a99eb6d97fd274 100644 --- a/libcxx/include/__algorithm/pstl_for_each.h +++ b/libcxx/include/__algorithm/pstl_for_each.h @@ -11,11 +11,11 @@ #include <__algorithm/for_each.h> #include <__algorithm/for_each_n.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__iterator/concepts.h> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> diff --git a/libcxx/include/__algorithm/pstl_generate.h b/libcxx/include/__algorithm/pstl_generate.h index 886af290d7f25a..350c0e4798be67 100644 --- a/libcxx/include/__algorithm/pstl_generate.h +++ b/libcxx/include/__algorithm/pstl_generate.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_GENERATE_H #define _LIBCPP___ALGORITHM_PSTL_GENERATE_H -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_for_each.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> diff --git a/libcxx/include/__algorithm/pstl_is_partitioned.h b/libcxx/include/__algorithm/pstl_is_partitioned.h index 108bb1e4325260..c016b388e3784a 100644 --- a/libcxx/include/__algorithm/pstl_is_partitioned.h +++ 
b/libcxx/include/__algorithm/pstl_is_partitioned.h @@ -10,11 +10,11 @@ #define _LIBCPP___ALGORITHM_PSTL_IS_PARITTIONED #include <__algorithm/pstl_any_all_none_of.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_find.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> diff --git a/libcxx/include/__algorithm/pstl_merge.h b/libcxx/include/__algorithm/pstl_merge.h index d03cd8c7fbd580..87f634a67f5889 100644 --- a/libcxx/include/__algorithm/pstl_merge.h +++ b/libcxx/include/__algorithm/pstl_merge.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_MERGE_H #define _LIBCPP___ALGORITHM_PSTL_MERGE_H -#include <__algorithm/pstl_backend.h> #include <__config> #include <__functional/operations.h> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> diff --git a/libcxx/include/__algorithm/pstl_move.h b/libcxx/include/__algorithm/pstl_move.h index f4c8c1fbb2e876..3155ddedf91bb6 100644 --- a/libcxx/include/__algorithm/pstl_move.h +++ b/libcxx/include/__algorithm/pstl_move.h @@ -10,13 +10,13 @@ #define _LIBCPP___ALGORITHM_PSTL_MOVE_H #include <__algorithm/copy_n.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__algorithm/pstl_transform.h> #include <__config> #include <__functional/identity.h> #include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_execution_policy.h> diff --git a/libcxx/include/__algorithm/pstl_replace.h b/libcxx/include/__algorithm/pstl_replace.h index 73ac11cda26a9f..b2ded54dfe25f3 100644 --- a/libcxx/include/__algorithm/pstl_replace.h +++ b/libcxx/include/__algorithm/pstl_replace.h @@ -9,13 +9,13 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_REPLACE_H #define _LIBCPP___ALGORITHM_PSTL_REPLACE_H -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_for_each.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__algorithm/pstl_transform.h> #include <__config> #include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/remove_cvref.h> #include <__utility/move.h> diff --git a/libcxx/include/__algorithm/pstl_rotate_copy.h b/libcxx/include/__algorithm/pstl_rotate_copy.h index adab3958fe3112..1a32b710877c16 100644 --- a/libcxx/include/__algorithm/pstl_rotate_copy.h +++ b/libcxx/include/__algorithm/pstl_rotate_copy.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_ROTATE_COPY_H #define _LIBCPP___ALGORITHM_PSTL_ROTATE_COPY_H -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_copy.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/is_execution_policy.h> #include diff --git a/libcxx/include/__algorithm/pstl_sort.h b/libcxx/include/__algorithm/pstl_sort.h index 65bc794ca6f4c8..769dd81af77e04 100644 --- a/libcxx/include/__algorithm/pstl_sort.h +++ b/libcxx/include/__algorithm/pstl_sort.h @@ -9,12 +9,12 @@ #ifndef 
_LIBCPP___ALGORITHM_PSTL_SORT_H #define _LIBCPP___ALGORITHM_PSTL_SORT_H -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__algorithm/pstl_stable_sort.h> #include <__config> #include <__functional/operations.h> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> #include <__utility/empty.h> diff --git a/libcxx/include/__algorithm/pstl_stable_sort.h b/libcxx/include/__algorithm/pstl_stable_sort.h index 79b94557e3dc3a..f5e0dd40f72b47 100644 --- a/libcxx/include/__algorithm/pstl_stable_sort.h +++ b/libcxx/include/__algorithm/pstl_stable_sort.h @@ -9,10 +9,10 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_STABLE_SORT_H #define _LIBCPP___ALGORITHM_PSTL_STABLE_SORT_H -#include <__algorithm/pstl_backend.h> #include <__config> #include <__functional/operations.h> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> diff --git a/libcxx/include/__algorithm/pstl_transform.h b/libcxx/include/__algorithm/pstl_transform.h index a01a64a43cf1a3..80e1d6b496f2ea 100644 --- a/libcxx/include/__algorithm/pstl_transform.h +++ b/libcxx/include/__algorithm/pstl_transform.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_TRANSFORM_H #define _LIBCPP___ALGORITHM_PSTL_TRANSFORM_H -#include <__algorithm/pstl_backend.h> #include <__config> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in index 7c002c5bfcf8e7..89a14609ee3f92 100644 --- a/libcxx/include/__config_site.in +++ b/libcxx/include/__config_site.in @@ -32,9 +32,9 @@ #cmakedefine _LIBCPP_INSTRUMENTED_WITH_ASAN // PSTL backends -#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL -#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD -#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH +#cmakedefine _LIBCPP_PSTL_BACKEND_SERIAL +#cmakedefine _LIBCPP_PSTL_BACKEND_STD_THREAD +#cmakedefine _LIBCPP_PSTL_BACKEND_LIBDISPATCH // Hardening. 
#cmakedefine _LIBCPP_HARDENING_MODE_DEFAULT @_LIBCPP_HARDENING_MODE_DEFAULT@ diff --git a/libcxx/include/__numeric/pstl_transform_reduce.h b/libcxx/include/__numeric/pstl_transform_reduce.h index 2d2621dc8dadb1..fe41b1c86f3b1f 100644 --- a/libcxx/include/__numeric/pstl_transform_reduce.h +++ b/libcxx/include/__numeric/pstl_transform_reduce.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___NUMERIC_PSTL_TRANSFORM_REDUCE_H #define _LIBCPP___NUMERIC_PSTL_TRANSFORM_REDUCE_H -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__functional/operations.h> #include <__iterator/cpp17_iterator_concepts.h> #include <__numeric/transform_reduce.h> +#include <__pstl/configuration.h> #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> #include diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__pstl/backends/libdispatch.h similarity index 94% rename from libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h rename to libcxx/include/__pstl/backends/libdispatch.h index 17faadf55dd4fa..977b06b9a489c5 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h +++ b/libcxx/include/__pstl/backends/libdispatch.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H -#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H +#ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H +#define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H #include <__algorithm/inplace_merge.h> #include <__algorithm/lower_bound.h> @@ -23,6 +23,7 @@ #include <__memory/construct_at.h> #include <__memory/unique_ptr.h> #include <__numeric/reduce.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__utility/empty.h> #include <__utility/exception_guard.h> @@ -40,8 +41,6 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { -struct __libdispatch_backend_tag {}; - namespace __libdispatch { // ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do // we. 
@@ -349,4 +348,14 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS -#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H +// Implement PSTL algorithms based on the __cpu_traits specialized above +#include <__algorithm/pstl_backends/cpu_backends/any_of.h> +#include <__algorithm/pstl_backends/cpu_backends/fill.h> +#include <__algorithm/pstl_backends/cpu_backends/find_if.h> +#include <__algorithm/pstl_backends/cpu_backends/for_each.h> +#include <__algorithm/pstl_backends/cpu_backends/merge.h> +#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h> +#include <__algorithm/pstl_backends/cpu_backends/transform.h> +#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h> + +#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h b/libcxx/include/__pstl/backends/serial.h similarity index 78% rename from libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h rename to libcxx/include/__pstl/backends/serial.h index 7544619a8eefd8..8bb89450930968 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h +++ b/libcxx/include/__pstl/backends/serial.h @@ -7,10 +7,11 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_SERIAL_H -#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_SERIAL_H +#ifndef _LIBCPP___PSTL_BACKENDS_SERIAL_H +#define _LIBCPP___PSTL_BACKENDS_SERIAL_H #include <__config> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__utility/empty.h> #include <__utility/move.h> @@ -29,8 +30,6 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { -struct __serial_backend_tag {}; - template <> struct __cpu_traits<__serial_backend_tag> { template @@ -82,4 +81,14 @@ _LIBCPP_POP_MACROS #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17 -#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_SERIAL_H +// Implement PSTL algorithms based on the __cpu_traits specialized above +#include <__algorithm/pstl_backends/cpu_backends/any_of.h> +#include <__algorithm/pstl_backends/cpu_backends/fill.h> +#include <__algorithm/pstl_backends/cpu_backends/find_if.h> +#include <__algorithm/pstl_backends/cpu_backends/for_each.h> +#include <__algorithm/pstl_backends/cpu_backends/merge.h> +#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h> +#include <__algorithm/pstl_backends/cpu_backends/transform.h> +#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h> + +#endif // _LIBCPP___PSTL_BACKENDS_SERIAL_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h b/libcxx/include/__pstl/backends/std_thread.h similarity index 79% rename from libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h rename to libcxx/include/__pstl/backends/std_thread.h index 2acf912264a001..ab09f42cfdd8d9 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h +++ b/libcxx/include/__pstl/backends/std_thread.h @@ -6,11 +6,12 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_THREAD_H -#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_THREAD_H +#ifndef _LIBCPP___PSTL_BACKENDS_STD_THREAD_H +#define _LIBCPP___PSTL_BACKENDS_STD_THREAD_H #include <__assert> #include <__config> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__utility/empty.h> 
#include <__utility/move.h> @@ -32,8 +33,6 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { -struct __std_thread_backend_tag {}; - template <> struct __cpu_traits<__std_thread_backend_tag> { template @@ -85,4 +84,14 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS -#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_THREAD_H +// Implement PSTL algorithms based on the __cpu_traits specialized above +#include <__algorithm/pstl_backends/cpu_backends/any_of.h> +#include <__algorithm/pstl_backends/cpu_backends/fill.h> +#include <__algorithm/pstl_backends/cpu_backends/find_if.h> +#include <__algorithm/pstl_backends/cpu_backends/for_each.h> +#include <__algorithm/pstl_backends/cpu_backends/merge.h> +#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h> +#include <__algorithm/pstl_backends/cpu_backends/transform.h> +#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h> + +#endif // _LIBCPP___PSTL_BACKENDS_STD_THREAD_H diff --git a/libcxx/include/__pstl/configuration.h b/libcxx/include/__pstl/configuration.h new file mode 100644 index 00000000000000..d32bd21df1f9e5 --- /dev/null +++ b/libcxx/include/__pstl/configuration.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___PSTL_CONFIGURATION_H +#define _LIBCPP___PSTL_CONFIGURATION_H + +#include <__config> +#include <__pstl/configuration_fwd.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if defined(_LIBCPP_PSTL_BACKEND_SERIAL) +# include <__pstl/backends/serial.h> +#elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) +# include <__pstl/backends/std_thread.h> +#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) +# include <__pstl/backends/libdispatch.h> +#endif + +#endif // _LIBCPP___PSTL_CONFIGURATION_H diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__pstl/configuration_fwd.h similarity index 93% rename from libcxx/include/__algorithm/pstl_backend.h rename to libcxx/include/__pstl/configuration_fwd.h index 3af03ce2fbc8ee..995fcfce847cbc 100644 --- a/libcxx/include/__algorithm/pstl_backend.h +++ b/libcxx/include/__pstl/configuration_fwd.h @@ -6,10 +6,9 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKEND_H -#define _LIBCPP___ALGORITHM_PSTL_BACKEND_H +#ifndef _LIBCPP___PSTL_CONFIGURATION_FWD_H +#define _LIBCPP___PSTL_CONFIGURATION_FWD_H -#include <__algorithm/pstl_backends/cpu_backend.h> #include <__config> #include @@ -191,6 +190,20 @@ into a program termination at the front-end level. When a backend returns a dise frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user. 
*/ +namespace __pstl { +struct __libdispatch_backend_tag {}; +struct __serial_backend_tag {}; +struct __std_thread_backend_tag {}; +} // namespace __pstl + +# if defined(_LIBCPP_PSTL_BACKEND_SERIAL) +using __cpu_backend_tag = __pstl::__serial_backend_tag; +# elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) +using __cpu_backend_tag = __pstl::__std_thread_backend_tag; +# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) +using __cpu_backend_tag = __pstl::__libdispatch_backend_tag; +# endif + template struct __select_backend; @@ -206,8 +219,8 @@ struct __select_backend { }; # endif -# if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) || \ - defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH) +# if defined(_LIBCPP_PSTL_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) || \ + defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) template <> struct __select_backend { using type = __cpu_backend_tag; @@ -229,4 +242,4 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 -#endif // _LIBCPP___ALGORITHM_PSTL_BACKEND_H +#endif // _LIBCPP___PSTL_CONFIGURATION_FWD_H diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp index 8820fb8c0936f9..a4e2690fc55c9a 100644 --- a/libcxx/include/libcxx.imp +++ b/libcxx/include/libcxx.imp @@ -73,18 +73,12 @@ { include: [ "<__algorithm/pop_heap.h>", "private", "", "public" ] }, { include: [ "<__algorithm/prev_permutation.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_any_all_none_of.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backend.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backends/cpu_backend.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/any_of.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backends/cpu_backends/backend.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/fill.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/find_if.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/for_each.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backends/cpu_backends/libdispatch.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/merge.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backends/cpu_backends/serial.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/stable_sort.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backends/cpu_backends/thread.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/transform.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/transform_reduce.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_copy.h>", "private", "", "public" ] }, diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index ce133e471deb70..f996c2cc05459a 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -714,32 +714,14 @@ module std_private_algorithm_partition_point [system module std_private_algorithm_pop_heap [system] { header "__algorithm/pop_heap.h" } module std_private_algorithm_prev_permutation [system] { header "__algorithm/prev_permutation.h" } module std_private_algorithm_pstl_any_all_none_of [system] { header "__algorithm/pstl_any_all_none_of.h" } -module 
std_private_algorithm_pstl_backend [system] { - header "__algorithm/pstl_backend.h" - export * -} -module std_private_algorithm_pstl_backends_cpu_backend [system] { - header "__algorithm/pstl_backends/cpu_backend.h" - export * -} -module std_private_algorithm_pstl_backends_cpu_backends_any_of [system] { header "__algorithm/pstl_backends/cpu_backends/any_of.h" } -module std_private_algorithm_pstl_backends_cpu_backends_backend [system] { - header "__algorithm/pstl_backends/cpu_backends/backend.h" - export * -} -module std_private_algorithm_pstl_backends_cpu_backends_fill [system] { header "__algorithm/pstl_backends/cpu_backends/fill.h" } -module std_private_algorithm_pstl_backends_cpu_backends_find_if [system] { header "__algorithm/pstl_backends/cpu_backends/find_if.h" } -module std_private_algorithm_pstl_backends_cpu_backends_for_each [system] { header "__algorithm/pstl_backends/cpu_backends/for_each.h" } -module std_private_algorithm_pstl_backends_cpu_backends_libdispatch [system] { header "__algorithm/pstl_backends/cpu_backends/libdispatch.h" } -module std_private_algorithm_pstl_backends_cpu_backends_merge [system] { header "__algorithm/pstl_backends/cpu_backends/merge.h" } -module std_private_algorithm_pstl_backends_cpu_backends_serial [system] { textual header "__algorithm/pstl_backends/cpu_backends/serial.h" } -module std_private_algorithm_pstl_backends_cpu_backends_stable_sort [system] { header "__algorithm/pstl_backends/cpu_backends/stable_sort.h" } -module std_private_algorithm_pstl_backends_cpu_backends_thread [system] { textual header "__algorithm/pstl_backends/cpu_backends/thread.h" } -module std_private_algorithm_pstl_backends_cpu_backends_transform [system] { - header "__algorithm/pstl_backends/cpu_backends/transform.h" - export std_private_algorithm_transform -} -module std_private_algorithm_pstl_backends_cpu_backends_transform_reduce [system] { header "__algorithm/pstl_backends/cpu_backends/transform_reduce.h" } +module std_private_algorithm_pstl_backends_cpu_backends_any_of [system] { textual header "__algorithm/pstl_backends/cpu_backends/any_of.h" } +module std_private_algorithm_pstl_backends_cpu_backends_fill [system] { textual header "__algorithm/pstl_backends/cpu_backends/fill.h" } +module std_private_algorithm_pstl_backends_cpu_backends_find_if [system] { textual header "__algorithm/pstl_backends/cpu_backends/find_if.h" } +module std_private_algorithm_pstl_backends_cpu_backends_for_each [system] { textual header "__algorithm/pstl_backends/cpu_backends/for_each.h" } +module std_private_algorithm_pstl_backends_cpu_backends_merge [system] { textual header "__algorithm/pstl_backends/cpu_backends/merge.h" } +module std_private_algorithm_pstl_backends_cpu_backends_stable_sort [system] { textual header "__algorithm/pstl_backends/cpu_backends/stable_sort.h" } +module std_private_algorithm_pstl_backends_cpu_backends_transform [system] { textual header "__algorithm/pstl_backends/cpu_backends/transform.h" } +module std_private_algorithm_pstl_backends_cpu_backends_transform_reduce [system] { textual header "__algorithm/pstl_backends/cpu_backends/transform_reduce.h" } module std_private_algorithm_pstl_copy [system] { header "__algorithm/pstl_copy.h" } module std_private_algorithm_pstl_count [system] { header "__algorithm/pstl_count.h" } module std_private_algorithm_pstl_equal [system] { header "__algorithm/pstl_equal.h" } @@ -1613,7 +1595,18 @@ module std_private_numeric_transform_exclusive_scan [system] { header "__numeric module std_private_numeric_transform_inclusive_scan 
[system] { header "__numeric/transform_inclusive_scan.h" } module std_private_numeric_transform_reduce [system] { header "__numeric/transform_reduce.h" } -module std_private_pstl_cpu_algos_cpu_traits [system] { header "__pstl/cpu_algos/cpu_traits.h" } +module std_private_pstl_backends_libdispatch [system] { header "__pstl/backends/libdispatch.h" } +module std_private_pstl_backends_serial [system] { header "__pstl/backends/serial.h" } +module std_private_pstl_backends_std_thread [system] { header "__pstl/backends/std_thread.h" } +module std_private_pstl_cpu_algos_cpu_traits [system] { header "__pstl/cpu_algos/cpu_traits.h" } +module std_private_pstl_configuration_fwd [system] { + header "__pstl/configuration_fwd.h" + export * +} +module std_private_pstl_configuration [system] { + header "__pstl/configuration.h" + export * +} module std_private_queue_fwd [system] { header "__fwd/queue.h" } diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index a4a3fee8645710..8b28d1b8918955 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -327,7 +327,7 @@ set(LIBCXX_EXPERIMENTAL_SOURCES experimental/keep.cpp ) -if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch") +if (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch") list(APPEND LIBCXX_EXPERIMENTAL_SOURCES pstl/libdispatch.cpp ) diff --git a/libcxx/src/pstl/libdispatch.cpp b/libcxx/src/pstl/libdispatch.cpp index d997a9c73463d3..3dca702341c85a 100644 --- a/libcxx/src/pstl/libdispatch.cpp +++ b/libcxx/src/pstl/libdispatch.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include <__algorithm/min.h> -#include <__algorithm/pstl_backends/cpu_backends/libdispatch.h> #include <__config> +#include <__pstl/backends/libdispatch.h> #include _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp index 8c7016a80b811a..b48ac02dd79c59 100644 --- a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp +++ b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp @@ -8,11 +8,11 @@ // -// REQUIRES: libcpp-pstl-cpu-backend-libdispatch +// REQUIRES: libcpp-pstl-backend-libdispatch // __chunk_partitions __partition_chunks(ptrdiff_t); -#include <__algorithm/pstl_backends/cpu_backends/libdispatch.h> +#include <__pstl/backends/libdispatch.h> #include #include diff --git a/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp b/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp index 3e2e080368f4c2..4ea27401e35d4d 100644 --- a/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp +++ b/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp @@ -45,4 +45,4 @@ // Make sure we use the libdispatch backend for the PSTL. 
// -// RUN: grep "%{include-dir}/__config_site" -e '#define _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH' +// RUN: grep "%{include-dir}/__config_site" -e '#define _LIBCPP_PSTL_BACKEND_LIBDISPATCH' diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 6ff16309546bae..c81b56b1af5477 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -318,7 +318,7 @@ def _getAndroidDeviceApi(cfg): "_LIBCPP_HAS_NO_WIDE_CHARACTERS": "no-wide-characters", "_LIBCPP_HAS_NO_TIME_ZONE_DATABASE": "no-tzdb", "_LIBCPP_HAS_NO_UNICODE": "libcpp-has-no-unicode", - "_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH": "libcpp-pstl-cpu-backend-libdispatch", + "_LIBCPP_PSTL_BACKEND_LIBDISPATCH": "libcpp-pstl-backend-libdispatch", } for macro, feature in macros.items(): DEFAULT_FEATURES.append( From 8d49ce176414cd4d0d5d276fd721d9226e17e810 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 17 Apr 2024 18:38:24 +0100 Subject: [PATCH 286/300] [GlobalISel][AArch64] Add LLRINT support (#88702) This hooks up G_INTRINSIC_LLRINT instructions, very similar to the lrint nodes that already exist. On AArch64 they are treated the same as lrint with the default return types. --- llvm/include/llvm/Support/TargetOpcodes.def | 3 + llvm/include/llvm/Target/GenericOpcodes.td | 6 ++ .../Target/GlobalISel/SelectionDAGCompat.td | 1 + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 2 + .../CodeGen/GlobalISel/LegalizerHelper.cpp | 6 +- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 2 +- .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 1 + .../AArch64/GlobalISel/arm64-irtranslator.ll | 10 ++ .../AArch64/GlobalISel/legalize-llrint.mir | 98 +++++++++++++++++++ .../GlobalISel/legalizer-info-validation.mir | 4 + llvm/test/CodeGen/AArch64/llrint-conv-fp16.ll | 2 + llvm/test/CodeGen/AArch64/llrint-conv.ll | 58 +++++++---- .../builtins/match-table-replacerreg.td | 22 ++--- .../match-table-imms.td | 30 +++--- .../match-table-intrinsics.td | 16 +-- .../match-table-patfrag-root.td | 30 +++--- .../GlobalISelCombinerEmitter/match-table.td | 62 ++++++------ llvm/test/TableGen/GlobalISelEmitter.td | 2 +- 18 files changed, 251 insertions(+), 104 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-llrint.mir diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 5765926d6d93d3..cb98f96af522f7 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -351,6 +351,9 @@ HANDLE_TARGET_OPCODE(G_INTRINSIC_ROUND) /// INTRINSIC round to integer intrinsic. HANDLE_TARGET_OPCODE(G_INTRINSIC_LRINT) +/// INTRINSIC long round to integer intrinsic. +HANDLE_TARGET_OPCODE(G_INTRINSIC_LLRINT) + /// INTRINSIC roundeven intrinsic. 
HANDLE_TARGET_OPCODE(G_INTRINSIC_ROUNDEVEN) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index d0f471eb29b6fd..e8cf8fcb647f45 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1089,6 +1089,12 @@ def G_INTRINSIC_LRINT : GenericInstruction { let hasSideEffects = false; } +def G_INTRINSIC_LLRINT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = false; +} + def G_INTRINSIC_ROUNDEVEN : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src1); diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index dd4e7d790bc6b3..8fa0e4b86d6dc9 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -157,6 +157,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 9b4575f7f34d47..0b6aae3759756f 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1955,6 +1955,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { return TargetOpcode::G_PTRMASK; case Intrinsic::lrint: return TargetOpcode::G_INTRINSIC_LRINT; + case Intrinsic::llrint: + return TargetOpcode::G_INTRINSIC_LLRINT; // FADD/FMUL require checking the FMF, so are handled elsewhere. case Intrinsic::vector_reduce_fmin: return TargetOpcode::G_VECREDUCE_FMIN; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 156353296cfc12..d55091e2e71739 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -474,6 +474,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(ROUNDEVEN_F); case TargetOpcode::G_INTRINSIC_LRINT: RTLIBCASE(LRINT_F); + case TargetOpcode::G_INTRINSIC_LLRINT: + RTLIBCASE(LLRINT_F); } llvm_unreachable("Unknown libcall function"); } @@ -1061,7 +1063,8 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { return Status; break; } - case TargetOpcode::G_INTRINSIC_LRINT: { + case TargetOpcode::G_INTRINSIC_LRINT: + case TargetOpcode::G_INTRINSIC_LLRINT: { LLT LLTy = MRI.getType(MI.getOperand(1).getReg()); unsigned Size = LLTy.getSizeInBits(); Type *HLTy = getFloatTypeForLLT(Ctx, LLTy); @@ -2661,6 +2664,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: case TargetOpcode::G_INTRINSIC_LRINT: + case TargetOpcode::G_INTRINSIC_LLRINT: case TargetOpcode::G_IS_FPCLASS: Observer.changingInstr(MI); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 661ea151d1a0ce..85dd0f2eb192d9 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -262,7 +262,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .minScalar(0, s32) .scalarize(0); - getActionDefinitionsBuilder(G_INTRINSIC_LRINT) + getActionDefinitionsBuilder({G_INTRINSIC_LRINT, 
G_INTRINSIC_LLRINT}) .legalFor({{s64, MinFPScalar}, {s64, s32}, {s64, s64}}) .libcallFor({{s64, s128}}) .minScalarOrElt(1, MinFPScalar); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index d5c4ce1888e78c..44ba9f0429e671 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -793,6 +793,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: case TargetOpcode::G_INTRINSIC_LRINT: + case TargetOpcode::G_INTRINSIC_LLRINT: if (MRI.getType(MI.getOperand(0).getReg()).isVector()) break; OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index a131f35e66d033..a61931b898aea5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -1385,6 +1385,16 @@ define i32 @test_intrinsic_lrint(float %a) { ret i32 %res } +declare i32 @llvm.llrint.i32.f32(float) +define i32 @test_intrinsic_llrint(float %a) { +; CHECK-LABEL: name: test_intrinsic_llrint +; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0 +; CHECK: [[RES:%[0-9]+]]:_(s32) = G_INTRINSIC_LLRINT [[A]] +; CHECK: $w0 = COPY [[RES]] + %res = call i32 @llvm.llrint.i32.f32(float %a) + ret i32 %res +} + declare i32 @llvm.ctlz.i32(i32, i1) define i32 @test_ctlz_intrinsic_zero_not_undef(i32 %a) { ; CHECK-LABEL: name: test_ctlz_intrinsic_zero_not_undef diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-llrint.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-llrint.mir new file mode 100644 index 00000000000000..f77649b793951d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-llrint.mir @@ -0,0 +1,98 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -verify-machineinstrs -mtriple aarch64-unknown-unknown -run-pass=legalizer %s -o - | FileCheck %s +--- +name: testmsws +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$s0' } +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: testmsws + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[INTRINSIC_LLRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LLRINT [[COPY]](s32) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[INTRINSIC_LLRINT]](s64) + ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s32) = COPY $s0 + %1:_(s64) = G_INTRINSIC_LLRINT %0(s32) + %2:_(s32) = G_TRUNC %1(s64) + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: testmsxs +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$s0' } +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: testmsxs + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[INTRINSIC_LLRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LLRINT [[COPY]](s32) + ; CHECK-NEXT: $x0 = COPY [[INTRINSIC_LLRINT]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s32) = COPY $s0 + %1:_(s64) = G_INTRINSIC_LLRINT %0(s32) + $x0 = COPY %1(s64) + RET_ReallyLR implicit $x0 + +... 
+--- +name: testmswd +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$d0' } +body: | + bb.1: + liveins: $d0 + + ; CHECK-LABEL: name: testmswd + ; CHECK: liveins: $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK-NEXT: [[INTRINSIC_LLRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LLRINT [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[INTRINSIC_LLRINT]](s64) + ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %0:_(s64) = COPY $d0 + %1:_(s64) = G_INTRINSIC_LLRINT %0(s64) + %2:_(s32) = G_TRUNC %1(s64) + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: testmsxd +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$d0' } +body: | + bb.1: + liveins: $d0 + + ; CHECK-LABEL: name: testmsxd + ; CHECK: liveins: $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK-NEXT: [[INTRINSIC_LLRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LLRINT [[COPY]](s64) + ; CHECK-NEXT: $x0 = COPY [[INTRINSIC_LLRINT]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(s64) = COPY $d0 + %1:_(s64) = G_INTRINSIC_LLRINT %0(s64) + $x0 = COPY %1(s64) + RET_ReallyLR implicit $x0 + +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 0793f3983c8e57..098726b0a980df 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -156,6 +156,10 @@ # DEBUG-NEXT: G_INTRINSIC_LRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 2, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_INTRINSIC_LLRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_INTRINSIC_ROUNDEVEN (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected diff --git a/llvm/test/CodeGen/AArch64/llrint-conv-fp16.ll b/llvm/test/CodeGen/AArch64/llrint-conv-fp16.ll index 1adbbab76abf52..7e28c863b07a9a 100644 --- a/llvm/test/CodeGen/AArch64/llrint-conv-fp16.ll +++ b/llvm/test/CodeGen/AArch64/llrint-conv-fp16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16 +; RUN: llc < %s -mtriple=aarch64 -global-isel | FileCheck %s --check-prefixes=CHECK-NOFP16 +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK-FP16 define i16 @testmhhs(half %x) { ; CHECK-NOFP16-LABEL: testmhhs: diff --git a/llvm/test/CodeGen/AArch64/llrint-conv.ll b/llvm/test/CodeGen/AArch64/llrint-conv.ll index fa11b007eeb3dd..3a6396d120f791 100644 --- a/llvm/test/CodeGen/AArch64/llrint-conv.ll +++ b/llvm/test/CodeGen/AArch64/llrint-conv.ll @@ -1,59 +1,75 @@ -; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=aarch64 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -global-isel | FileCheck %s -; CHECK-LABEL: testmsws: -; CHECK: frintx [[REG:s[0-9]]], s0 -; CHECK-NEXT: fcvtzs x0, [[REG]] -; CHECK: ret define i32 @testmsws(float %x) { +; CHECK-LABEL: testmsws: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx s0, s0 +; CHECK-NEXT: fcvtzs x0, s0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.llrint.f32(float %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxs: -; CHECK: frintx [[REG:s[0-9]]], s0 -; CHECK-NEXT: fcvtzs x0, [[REG]] -; CHECK-NEXT: ret define i64 @testmsxs(float %x) { +; CHECK-LABEL: testmsxs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx s0, s0 +; CHECK-NEXT: fcvtzs x0, s0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.llrint.f32(float %x) ret i64 %0 } -; CHECK-LABEL: testmswd: -; CHECK: frintx [[REG:d[0-9]]], d0 -; CHECK-NEXT: fcvtzs x0, [[REG]] -; CHECK: ret define i32 @testmswd(double %x) { +; CHECK-LABEL: testmswd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx d0, d0 +; CHECK-NEXT: fcvtzs x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.llrint.f64(double %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxd: -; CHECK: frintx [[REG:d[0-9]]], d0 -; CHECK-NEXT: fcvtzs x0, [[REG]] -; CHECK-nEXT: ret define i64 @testmsxd(double %x) { +; CHECK-LABEL: testmsxd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx d0, d0 +; CHECK-NEXT: fcvtzs x0, d0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.llrint.f64(double %x) ret i64 %0 } -; CHECK-LABEL: testmswl: -; CHECK: bl llrintl define i32 @testmswl(fp128 %x) { +; CHECK-LABEL: testmswl: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl llrintl +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.llrint.f128(fp128 %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsll: -; CHECK: b llrintl define i64 @testmsll(fp128 %x) { +; CHECK-LABEL: testmsll: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b llrintl entry: %0 = tail call i64 @llvm.llrint.f128(fp128 %x) ret i64 %0 diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td index 40a831d7e9e8f6..ebb95ccb210408 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td @@ -28,11 +28,11 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(69), GIMT_Encode2(186), /*)*//*default:*//*Label 2*/ GIMT_Encode4(562), -// CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ GIMT_Encode4(478), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_FNEG*//*Label 1*/ GIMT_Encode4(530), -// CHECK-NEXT: // Label 0: @478 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4(529), // Rule ID 1 // +// CHECK-NEXT: GIM_SwitchOpcode, 
/*MI*/0, /*[*/GIMT_Encode2({{[0-9]+}}), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 2*/ GIMT_Encode4([[L562:[0-9]+]]), +// CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ GIMT_Encode4([[L478:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_FNEG*//*Label 1*/ GIMT_Encode4([[L530:[0-9]+]]), +// CHECK-NEXT: // Label 0: @[[L478]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4([[L529:[0-9]+]]), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, // CHECK-NEXT: // MIs[0] a @@ -57,10 +57,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_ReplaceRegWithTempReg, /*OldInsnID*/0, /*OldOpIdx*/1, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 3: @529 +// CHECK-NEXT: // Label 3: @[[L529]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @530 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(561), // Rule ID 0 // +// CHECK-NEXT: // Label 1: @[[L530]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4([[L561:[0-9]+]]), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates @@ -75,10 +75,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_ReplaceReg, /*OldInsnID*/0, /*OldOpIdx*/0, /*NewInsnId*/1, /*NewOpIdx*/1, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// 
CHECK-NEXT: // Label 4: @561 +// CHECK-NEXT: // Label 4: @[[L561]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @562 +// CHECK-NEXT: // Label 2: @[[L562]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 563 bytes +// CHECK-NEXT: }; // Size: {{[0-9]+}} bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td index 751b1318ecc01f..6004a17d351be7 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td @@ -34,12 +34,12 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(132), /*)*//*default:*//*Label 3*/ GIMT_Encode4(579), -// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(462), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4(493), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(539), -// CHECK-NEXT: // Label 0: @462 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(492), // Rule ID 0 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 3*/ GIMT_Encode4([[L579:[0-9]+]]), +// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4([[L462:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4([[L493:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4({{[0-9]+}}), +// CHECK-NEXT: // Label 0: @[[L462]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4([[L492:[0-9]+]]), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] a @@ -51,10 +51,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddImm8, /*InsnID*/0, /*Imm*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 4: @492 +// CHECK-NEXT: // Label 4: @[[L492]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @493 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(538), // Rule ID 2 // +// CHECK-NEXT: // Label 1: @[[L493]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4([[L538:[0-9]+]]), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] a @@ -66,10 +66,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddCImm, /*InsnID*/0, /*Type*/GILLT_s32, /*Imm*/GIMT_Encode8(42), // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 5: @538 +// CHECK-NEXT: // Label 5: @[[L538]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @539 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(578), // Rule ID 1 // +// CHECK-NEXT: // Label 2: @{{[0-9]+}} +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4([[L578:[0-9]+]]), // Rule ID 1 // // CHECK-NEXT: 
GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -83,10 +83,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 6: @578 +// CHECK-NEXT: // Label 6: @[[L578]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @579 +// CHECK-NEXT: // Label 3: @[[L579]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 580 bytes +// CHECK-NEXT: }; // Size: {{[0-9]+}} bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td index e8e6d3e74f4024..b2dd8b6684b1d3 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td @@ -29,11 +29,11 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(119), GIMT_Encode2(121), /*)*//*default:*//*Label 2*/ GIMT_Encode4(132), +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2({{[0-9]+}}), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 2*/ GIMT_Encode4([[L132:[0-9]+]]), // CHECK-NEXT: /*TargetOpcode::G_INTRINSIC*//*Label 0*/ GIMT_Encode4(18), -// CHECK-NEXT: /*TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS*//*Label 1*/ GIMT_Encode4(73), +// CHECK-NEXT: /*TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS*//*Label 1*/ GIMT_Encode4([[L73:[0-9]+]]), // CHECK-NEXT: // Label 0: @18 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4(72), // Rule ID 0 // +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4([[L72:[0-9]+]]), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, // CHECK-NEXT: GIM_CheckIntrinsicID, /*MI*/0, /*Op*/1, GIMT_Encode2(Intrinsic::1in_1out), @@ -52,10 +52,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/1, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 3: @72 +// CHECK-NEXT: // Label 3: @[[L72]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @73 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(131), // Rule ID 1 // +// CHECK-NEXT: // Label 1: @[[L73]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4([[L131:[0-9]+]]), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, // CHECK-NEXT: GIM_CheckIntrinsicID, /*MI*/0, /*Op*/1, GIMT_Encode2(Intrinsic::sideeffects_1in_1out), @@ -76,9 +76,9 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/1, /*NumInsns*/1, /*MergeInsnID's*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 4: @131 +// CHECK-NEXT: // Label 4: @[[L131]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @132 +// CHECK-NEXT: // Label 2: @[[L132]] // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: }; // Size: 133 bytes // CHECK-NEXT: return MatchTable0; diff 
--git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td index 26a0ec6235e309..016ab05ca01e4e 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td @@ -28,12 +28,12 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(124), GIMT_Encode2(187), /*)*//*default:*//*Label 3*/ GIMT_Encode4(380), -// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 0*/ GIMT_Encode4(262), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 1*/ GIMT_Encode4(298), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_FPEXT*//*Label 2*/ GIMT_Encode4(344), -// CHECK-NEXT: // Label 0: @262 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(297), // Rule ID 1 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2({{[0-9]+}}), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 3*/ GIMT_Encode4([[L380:[0-9]+]]), +// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 0*/ GIMT_Encode4([[L262:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 1*/ GIMT_Encode4([[L298:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_FPEXT*//*Label 2*/ GIMT_Encode4([[L344:[0-9]+]]), +// CHECK-NEXT: // Label 0: @[[L262]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 
4*/ GIMT_Encode4([[L297:[0-9]+]]), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // MIs[0] root // CHECK-NEXT: // No operand predicates @@ -47,10 +47,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 4: @297 +// CHECK-NEXT: // Label 4: @[[L297]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @298 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(343), // Rule ID 0 // +// CHECK-NEXT: // Label 1: @[[L298]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4([[L343:[0-9]+]]), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // MIs[0] root // CHECK-NEXT: // No operand predicates @@ -68,10 +68,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 5: @343 +// CHECK-NEXT: // Label 5: @[[L343]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @344 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(379), // Rule ID 2 // +// CHECK-NEXT: // Label 2: @[[L344]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4([[L379:[0-9]+]]), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // MIs[0] root // CHECK-NEXT: // No operand predicates @@ -85,10 +85,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 6: @379 +// CHECK-NEXT: // Label 6: @[[L379]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @380 +// CHECK-NEXT: // Label 3: @[[L380]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 381 bytes +// CHECK-NEXT: }; // Size: {{[0-9]+}} bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td index 0189d3d056fc06..02085c1fd2666b 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td @@ -135,15 +135,15 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // Verify match table. 
// CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(132), /*)*//*default:*//*Label 6*/ GIMT_Encode4(677), -// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(462), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(504), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(557), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(599), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(624), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(637), -// CHECK-NEXT: // Label 0: @462 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(491), // Rule ID 4 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 6*/ GIMT_Encode4([[L677:[0-9]+]]), +// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4([[L462:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: 
/*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4([[L504:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4([[L557:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4([[L599:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4([[L624:[0-9]+]]), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4([[L637:[0-9]+]]), +// CHECK-NEXT: // Label 0: @[[L462]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4([[L491:[0-9]+]]), // Rule ID 4 // // CHECK-NEXT: GIM_CheckFeatures, GIMT_Encode2(GIFBS_HasAnswerToEverything), // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule3Enabled), // CHECK-NEXT: // MIs[0] a @@ -158,8 +158,8 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // Combiner Rule #3: InstTest1 // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 7: @491 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(503), // Rule ID 3 // +// CHECK-NEXT: // Label 7: @[[L491]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4([[L503:[0-9]+]]), // Rule ID 3 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -168,10 +168,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // Combiner Rule #2: InstTest0 // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner1), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 8: @503 +// CHECK-NEXT: // Label 8: @[[L503]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @504 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(556), // Rule ID 6 // +// CHECK-NEXT: // Label 1: @[[L504]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4([[L556:[0-9]+]]), // Rule ID 6 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule5Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] dst @@ -189,10 +189,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // z // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 9: @556 +// CHECK-NEXT: // Label 
9: @[[L556]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @557 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(598), // Rule ID 5 // +// CHECK-NEXT: // Label 2: @[[L557]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4([[L598:[0-9]+]]), // Rule ID 5 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule4Enabled), // CHECK-NEXT: // MIs[0] tmp // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/0, // MIs[1] @@ -210,32 +210,32 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner2), // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 10: @598 +// CHECK-NEXT: // Label 10: @[[L598]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @599 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(611), // Rule ID 0 // +// CHECK-NEXT: // Label 3: @[[L599]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4([[L611:[0-9]+]]), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // Combiner Rule #0: WipOpcodeTest0; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 11: @611 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(623), // Rule ID 1 // +// CHECK-NEXT: // Label 11: @[[L611]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4([[L623:[0-9]+]]), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 12: @623 +// CHECK-NEXT: // Label 12: @[[L623]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 4: @624 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(636), // Rule ID 2 // +// CHECK-NEXT: // Label 4: @[[L624]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4([[L636:[0-9]+]]), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_SEXT' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 13: @636 +// CHECK-NEXT: // Label 13: @[[L636]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 5: @637 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4(676), // Rule ID 7 // +// CHECK-NEXT: // Label 5: @[[L637]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4([[L676:[0-9]+]]), // Rule ID 7 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule6Enabled), // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates @@ -250,10 +250,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 14: @676 +// CHECK-NEXT: // Label 14: @[[L676]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 6: @677 +// CHECK-NEXT: // Label 6: @[[L677]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 678 bytes +// CHECK-NEXT: }; 
// Size: {{[0-9]+}} bytes
 // CHECK-NEXT: return MatchTable0;
 // CHECK-NEXT: }
diff --git a/llvm/test/TableGen/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter.td
index f79b792b37a36c..82ecc4495e80ac 100644
--- a/llvm/test/TableGen/GlobalISelEmitter.td
+++ b/llvm/test/TableGen/GlobalISelEmitter.td
@@ -518,7 +518,7 @@ def : Pat<(frag GPR32:$src1, complex:$src2, complex:$src3),
 // R00O-NEXT: GIM_Reject,
 // R00O: // Label [[DEFAULT_NUM]]: @[[DEFAULT]]
 // R00O-NEXT: GIM_Reject,
-// R00O-NEXT: }; // Size: 2023 bytes
+// R00O-NEXT: }; // Size: 2027 bytes
 
 def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4),
                 [(set GPR32:$dst,
From 2c22a0c16d1cb844eac142156ba67098627a336c Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Fri, 12 Apr 2024 12:35:40 -0700
Subject: [PATCH 287/300] [InstCombine] Add test case for turning sub into xor
 using dominating condition. NFC

I plan to disable using dominating conditions for turning sub into xor, but
first we need a test that demonstrates it currently happens.

---
 llvm/test/Transforms/InstCombine/sub-xor.ll | 23 +++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/sub-xor.ll b/llvm/test/Transforms/InstCombine/sub-xor.ll
index 71da73d51ae37e..b4e87d0405fc48 100644
--- a/llvm/test/Transforms/InstCombine/sub-xor.ll
+++ b/llvm/test/Transforms/InstCombine/sub-xor.ll
@@ -157,3 +157,26 @@ define <2 x i8> @xor_add_splat_undef(<2 x i8> %x) {
   %add = add <2 x i8> %xor, 
   ret <2 x i8> %add
 }
+
+define i32 @xor_dominating_cond(i32 %x) {
+; CHECK-LABEL: @xor_dominating_cond(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[X:%.*]], 256
+; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[A:%.*]] = xor i32 [[X]], 255
+; CHECK-NEXT: ret i32 [[A]]
+; CHECK: if.end:
+; CHECK-NEXT: ret i32 [[X]]
+;
+entry:
+  %cond = icmp ult i32 %x, 256
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  %a = sub i32 255, %x
+  ret i32 %a
+
+if.end:
+  ret i32 %x
+}
From 421a8c5892b7e59f27b2c21452f81fa789a758fd Mon Sep 17 00:00:00 2001
From: Craig Topper 
Date: Tue, 16 Apr 2024 21:25:28 -0700
Subject: [PATCH 288/300] [InstCombine] Add phase ordering test for #88239.
NFC --- .../Transforms/PhaseOrdering/X86/pr88239.ll | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll new file mode 100644 index 00000000000000..3afa1904fb249a --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes="default" -mcpu=skx -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @foo(ptr noalias noundef %0, ptr noalias noundef %1) optsize { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: ptr noalias nocapture noundef readonly [[TMP0:%.*]], ptr noalias nocapture noundef writeonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: br label [[TMP4:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[TMP4]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[TMP2]] ], [ [[VEC_IND_NEXT:%.*]], [[TMP4]] ] +; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[TMP6]], +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = tail call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP7]], i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER]], +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP10]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 256 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[TMP4]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: ret void +; + br label %3 + +3: ; preds = %7, %2 + %4 = phi i32 [ 0, %2 ], [ %15, %7 ] + %5 = icmp slt i32 %4, 256 + br i1 %5, label %7, label %6 + +6: ; preds = %3 + ret void + +7: ; preds = %3 + %8 = sub nsw i32 255, %4 + %9 = zext nneg i32 %8 to i64 + %10 = getelementptr inbounds i32, ptr %0, i64 %9 + %11 = load i32, ptr %10, align 4 + %12 = add nsw i32 %11, 5 + %13 = sext i32 %4 to i64 + %14 = getelementptr inbounds i32, ptr %1, i64 %13 + store i32 %12, ptr %14, align 4 + %15 = add nsw i32 %4, 1 + br label %3 +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. 
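Both InstCombine patches above exercise the same fold: once a dominating condition such as icmp ult i32 %x, 256 is known to hold, every set bit of %x lies inside the all-ones mask 255, so sub i32 255, %x can never borrow and is bit-for-bit equal to xor i32 %x, 255. The short C++ harness below is an illustrative sketch of that identity only; it is not part of either patch, and the exhaustive loop is my own:

// Illustrative check of the sub-to-xor identity behind xor_dominating_cond
// and pr88239.ll: for 0 <= x < 256, the subtrahend 255 already has every bit
// of x set, so 255 - x produces no borrows and equals 255 ^ x.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 256; ++x)
    assert(255u - x == (255u ^ x)); // also equal to (~x) & 255u
  return 0;
}

Presumably the pr88239.ll CHECK lines record today's end-to-end output, where the xor form survives into the vectorizer and the reversed accesses are lowered as masked gathers, so the effect of disabling the dominating-condition fold will be visible in this test.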
From ed741ffe893698cd14c6785ac2ee7031d9d344a6 Mon Sep 17 00:00:00 2001 From: Nathan Lanza Date: Wed, 17 Apr 2024 14:06:01 -0400 Subject: [PATCH 289/300] [github] Add ClangIR to new-prs-labeler.yml (#86088) --- .github/new-prs-labeler.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index a0428336d300f9..1502d64a7d3e3e 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -1,3 +1,9 @@ +ClangIR: + - clang/include/clang/CIR/**/* + - clang/lib/CIR/**/* + - clang/tools/cir-*/**/* + - clang/test/CIR/**/* + clang:dataflow: - clang/include/clang/Analysis/FlowSensitive/**/* - clang/lib/Analysis/FlowSensitive/**/* From c02ed29ec151d1d555c3735efef2ab215126ddbf Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 17 Apr 2024 19:01:44 +0100 Subject: [PATCH 290/300] [CostModel][X86] Recognise vector rotation by uniform constant patterns Adds suitable costs for AVX512 targets (we still rely on default expansion for AVX2 and earlier) --- .../lib/Target/X86/X86TargetTransformInfo.cpp | 41 +++++++++++++--- .../Analysis/CostModel/X86/fshl-codesize.ll | 24 +++++----- .../Analysis/CostModel/X86/fshl-latency.ll | 24 +++++----- .../CostModel/X86/fshl-sizelatency.ll | 24 +++++----- llvm/test/Analysis/CostModel/X86/fshl.ll | 22 ++++----- .../Analysis/CostModel/X86/fshr-codesize.ll | 48 +++++++++---------- .../Analysis/CostModel/X86/fshr-latency.ll | 24 +++++----- .../CostModel/X86/fshr-sizelatency.ll | 48 +++++++++---------- llvm/test/Analysis/CostModel/X86/fshr.ll | 22 ++++----- .../SLPVectorizer/X86/arith-fshl-rot.ll | 40 ++++++++++------ .../SLPVectorizer/X86/arith-fshr-rot.ll | 40 ++++++++++------ 11 files changed, 204 insertions(+), 153 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 38064f97926992..d111c4d4ecc1ae 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3402,6 +3402,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } }, { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } }, { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } }, }; static const CostKindTblEntry AVX512BITALGCostTbl[] = { { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } }, @@ -3498,6 +3501,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } }, { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } }, { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } }, + { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } }, + { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } }, + { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } }, + { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } }, + { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } }, + { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } }, { ISD::SADDSAT, MVT::v32i16, { 1 } }, { ISD::SADDSAT, MVT::v64i8, { 1 } }, { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } }, @@ -3556,6 +3565,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } }, { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } }, { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 
1, 1 } }, + { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } }, { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } }, { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } }, { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } }, @@ -3642,7 +3657,15 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } }, { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } }, { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } }, - { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } } + { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } }, + { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } }, + { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } }, + { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } }, + { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } }, + { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } }, + { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } }, + { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } }, + { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } }, }; static const CostKindTblEntry AVX2CostTbl[] = { { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) @@ -4096,9 +4119,11 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, const SmallVectorImpl &Args = ICA.getArgs(); if (Args[0] == Args[1]) { ISD = ISD::ROTL; - // Handle scalar constant rotation amounts. - // TODO: Handle vector + funnel-shift cases. - if (isa_and_nonnull(Args[2])) + // Handle uniform constant rotation amounts. + // TODO: Handle funnel-shift cases. + const APInt *Amt; + if (Args[2] && + PatternMatch::match(Args[2], PatternMatch::m_APIntAllowUndef(Amt))) ISD = X86ISD::VROTLI; } } @@ -4109,10 +4134,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (!ICA.isTypeBasedOnly()) { const SmallVectorImpl &Args = ICA.getArgs(); if (Args[0] == Args[1]) { - // Handle scalar constant rotation amount. - // TODO: Handle vector + funnel-shift cases. ISD = ISD::ROTR; - if (isa_and_nonnull(Args[2])) + // Handle uniform constant rotation amount. + // TODO: Handle funnel-shift cases. 
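+  // Note (editorial, not in the upstream patch): m_APIntAllowUndef binds the
+  // rotate amount when Args[2] is either a scalar constant integer or a splat
+  // vector constant, tolerating undef lanes, so this replaces the previous
+  // scalar-only check and lets vector rotates by a uniform constant take the
+  // cheaper X86ISD::VROTLI cost-table entries as well.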
+ const APInt *Amt; + if (Args[2] && + PatternMatch::match(Args[2], PatternMatch::m_APIntAllowUndef(Amt))) ISD = X86ISD::VROTLI; } } diff --git a/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll b/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll index c46e32ffb4ad3a..a7585a4d9f39e1 100644 --- a/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll @@ -2744,9 +2744,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i16' @@ -2829,9 +2829,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2843,9 +2843,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 7 for 
instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -2871,9 +2871,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) diff --git a/llvm/test/Analysis/CostModel/X86/fshl-latency.ll b/llvm/test/Analysis/CostModel/X86/fshl-latency.ll index fa32497c63ec7e..7105f713fdc349 100644 --- a/llvm/test/Analysis/CostModel/X86/fshl-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/fshl-latency.ll @@ -2696,9 +2696,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found 
an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i16' @@ -2781,9 +2781,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2795,9 +2795,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -2823,9 +2823,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' ; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) diff --git a/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll index 832a574a9b332a..5d7361e2931769 100644 --- a/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll @@ -2984,9 +2984,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i16' @@ -3069,9 +3069,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> 
) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -3083,9 +3083,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -3111,9 +3111,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 
= call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) diff --git a/llvm/test/Analysis/CostModel/X86/fshl.ll b/llvm/test/Analysis/CostModel/X86/fshl.ll index 311d8d5ed7d2a9..1cbdab09acd909 100644 --- a/llvm/test/Analysis/CostModel/X86/fshl.ll +++ b/llvm/test/Analysis/CostModel/X86/fshl.ll @@ -2682,8 +2682,8 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -2767,9 +2767,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2781,9 +2781,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: 
%V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -2809,9 +2809,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) diff --git a/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll b/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll index f9d30e4ced3ec9..ecc861dd7f8eed 100644 --- a/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll @@ -2644,9 +2644,9 @@ define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; ; XOP-LABEL: 'splatconstant_rotate_i64' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> 
@llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) @@ -2701,9 +2701,9 @@ define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; ; XOP-LABEL: 'splatconstant_rotate_i32' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) @@ -2744,9 +2744,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i16' @@ -2779,9 +2779,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; XOP-LABEL: 'splatconstant_rotate_i16' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, 
<16 x i16> %a256, <16 x i16> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' @@ -2829,9 +2829,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2843,9 +2843,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret 
void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -2864,16 +2864,16 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; XOP-LABEL: 'splatconstant_rotate_i8' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) diff --git a/llvm/test/Analysis/CostModel/X86/fshr-latency.ll b/llvm/test/Analysis/CostModel/X86/fshr-latency.ll index ed2227591847a6..0142ad77849cac 100644 --- a/llvm/test/Analysis/CostModel/X86/fshr-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/fshr-latency.ll @@ -2696,9 +2696,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i16' @@ -2781,9 +2781,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2795,9 +2795,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLM-LABEL: 
'splatconstant_rotate_i8' @@ -2823,9 +2823,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) diff --git a/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll index 8931781f70bdce..6dafb20a0aeed0 100644 --- a/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll @@ -2849,9 +2849,9 @@ define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; ; XOP-LABEL: 'splatconstant_rotate_i64' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i64' @@ -2934,9 +2934,9 @@ define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; ; XOP-LABEL: 'splatconstant_rotate_i32' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 9 
for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i32' @@ -2984,9 +2984,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i16' @@ -3019,9 +3019,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; XOP-LABEL: 'splatconstant_rotate_i16' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' @@ -3069,9 +3069,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -3083,9 +3083,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -3104,16 +3104,16 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; XOP-LABEL: 'splatconstant_rotate_i8' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 18 for 
instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) diff --git a/llvm/test/Analysis/CostModel/X86/fshr.ll b/llvm/test/Analysis/CostModel/X86/fshr.ll index ca9ddcc52938d7..ada1b9c5bdc4bf 100644 --- a/llvm/test/Analysis/CostModel/X86/fshr.ll +++ b/llvm/test/Analysis/CostModel/X86/fshr.ll @@ -2682,8 +2682,8 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -2767,9 +2767,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; 
AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2781,9 +2781,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -2809,9 +2809,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> 
%a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll index 639aa0a1c6a2c5..9b8480cd0088a3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll @@ -834,22 +834,34 @@ define void @fshl_v2i32_uniformconst() { ; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @fshl_v2i32_uniformconst( -; AVX-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @fshl_v2i32_uniformconst( +; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) +; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) +; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 +; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @fshl_v2i32_uniformconst( +; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) +; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) +; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 +; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; AVX2-NEXT: ret void +; +; AVX256-LABEL: @fshl_v2i32_uniformconst( +; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> ) +; AVX256-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fshl_v2i32_uniformconst( -; AVX512-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX512-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX512-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX512-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX512-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX512-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 +; AVX512-NEXT: 
[[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> ) +; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 ; AVX512-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll index c557c9647551a9..f3e73d0e6840e0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll @@ -834,22 +834,34 @@ define void @fshr_v2i32_uniformconst() { ; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @fshr_v2i32_uniformconst( -; AVX-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @fshr_v2i32_uniformconst( +; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) +; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) +; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 +; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @fshr_v2i32_uniformconst( +; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) +; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) +; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 +; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; AVX2-NEXT: ret void +; +; AVX256-LABEL: @fshr_v2i32_uniformconst( +; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> ) +; AVX256-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fshr_v2i32_uniformconst( -; AVX512-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX512-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX512-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX512-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX512-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX512-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> ) +; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 ; AVX512-NEXT: ret void ; %a0 = load i32, ptr getelementptr 
inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 From 58a08e154c804051aaca9151a8053aea3ec15646 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 17 Apr 2024 11:15:25 -0700 Subject: [PATCH 291/300] [RISCV] Add coverage for strength reduction of mul by small negative immediates --- llvm/test/CodeGen/RISCV/rv32zba.ll | 93 ++++++++++++++++++++++++++++++ llvm/test/CodeGen/RISCV/rv64zba.ll | 93 ++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rv32zba.ll b/llvm/test/CodeGen/RISCV/rv32zba.ll index cc632a09c8054b..a78f823d318418 100644 --- a/llvm/test/CodeGen/RISCV/rv32zba.ll +++ b/llvm/test/CodeGen/RISCV/rv32zba.ll @@ -645,3 +645,96 @@ define i32 @addshl_5_8(i32 %a, i32 %b) { %e = add i32 %c, %d ret i32 %e } + +define i32 @mul_neg1(i32 %a) { +; CHECK-LABEL: mul_neg1: +; CHECK: # %bb.0: +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -1 + ret i32 %c +} + +define i32 @mul_neg2(i32 %a) { +; CHECK-LABEL: mul_neg2: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -2 + ret i32 %c +} + +define i32 @mul_neg3(i32 %a) { +; RV32I-LABEL: mul_neg3: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: mul_neg3: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: sh1add a0, a0, a0 +; RV32ZBA-NEXT: neg a0, a0 +; RV32ZBA-NEXT: ret + %c = mul i32 %a, -3 + ret i32 %c +} + +define i32 @mul_neg4(i32 %a) { +; CHECK-LABEL: mul_neg4: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -4 + ret i32 %c +} + +define i32 @mul_neg5(i32 %a) { +; RV32I-LABEL: mul_neg5: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: mul_neg5: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: sh2add a0, a0, a0 +; RV32ZBA-NEXT: neg a0, a0 +; RV32ZBA-NEXT: ret + %c = mul i32 %a, -5 + ret i32 %c +} + +define i32 @mul_neg6(i32 %a) { +; CHECK-LABEL: mul_neg6: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i32 %a, -6 + ret i32 %c +} + +define i32 @mul_neg7(i32 %a) { +; CHECK-LABEL: mul_neg7: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i32 %a, -7 + ret i32 %c +} + +define i32 @mul_neg8(i32 %a) { +; CHECK-LABEL: mul_neg8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -8 + ret i32 %c +} diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index b4c80b60e0bad5..6939185947f443 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -2533,3 +2533,96 @@ define i64 @regression(i32 signext %x, i32 signext %y) { %res = mul nuw nsw i64 %ext, 24 ret i64 %res } + +define i64 @mul_neg1(i64 %a) { +; CHECK-LABEL: mul_neg1: +; CHECK: # %bb.0: +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i64 %a, -1 + ret i64 %c +} + +define i64 @mul_neg2(i64 %a) { +; CHECK-LABEL: mul_neg2: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i64 %a, -2 + ret i64 %c +} + +define i64 @mul_neg3(i64 %a) { +; RV64I-LABEL: mul_neg3: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul_neg3: +; RV64ZBA: # %bb.0: +; 
RV64ZBA-NEXT: sh1add a0, a0, a0 +; RV64ZBA-NEXT: neg a0, a0 +; RV64ZBA-NEXT: ret + %c = mul i64 %a, -3 + ret i64 %c +} + +define i64 @mul_neg4(i64 %a) { +; CHECK-LABEL: mul_neg4: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i64 %a, -4 + ret i64 %c +} + +define i64 @mul_neg5(i64 %a) { +; RV64I-LABEL: mul_neg5: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul_neg5: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh2add a0, a0, a0 +; RV64ZBA-NEXT: neg a0, a0 +; RV64ZBA-NEXT: ret + %c = mul i64 %a, -5 + ret i64 %c +} + +define i64 @mul_neg6(i64 %a) { +; CHECK-LABEL: mul_neg6: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, -6 + ret i64 %c +} + +define i64 @mul_neg7(i64 %a) { +; CHECK-LABEL: mul_neg7: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, -7 + ret i64 %c +} + +define i64 @mul_neg8(i64 %a) { +; CHECK-LABEL: mul_neg8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i64 %a, -8 + ret i64 %c +} From cc82f1290a1e2157a6c0530d78d8cc84d2b8553d Mon Sep 17 00:00:00 2001 From: Usman Nadeem Date: Wed, 17 Apr 2024 11:42:52 -0700 Subject: [PATCH 292/300] [AArch64] Update latencies for Cortex-A510 scheduling model (#87293) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updated according to the Software Optimization Guide for Arm® Cortex®‑A510 Core Revision: r1p3 Issue 6.0. --- llvm/lib/Target/AArch64/AArch64SchedA510.td | 145 +- .../AArch64/GlobalISel/combine-udiv.ll | 8 +- llvm/test/CodeGen/AArch64/aarch64-addv.ll | 10 +- .../AArch64/aarch64-dup-ext-scalable.ll | 40 +- llvm/test/CodeGen/AArch64/aarch64-smull.ll | 2 +- llvm/test/CodeGen/AArch64/active_lane_mask.ll | 169 +- .../CodeGen/AArch64/arm64-convert-v4f64.ll | 14 +- llvm/test/CodeGen/AArch64/arm64-vabs.ll | 106 +- llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll | 28 +- llvm/test/CodeGen/AArch64/arm64-vhadd.ll | 8 +- ...interleaving-add-mull-scalable-contract.ll | 64 +- ...x-deinterleaving-add-mull-scalable-fast.ll | 63 +- ...complex-deinterleaving-f16-mul-scalable.ll | 23 +- ...complex-deinterleaving-f32-mul-scalable.ll | 18 +- ...complex-deinterleaving-f64-mul-scalable.ll | 18 +- ...complex-deinterleaving-i16-mul-scalable.ll | 3 +- ...rleaving-reductions-predicated-scalable.ll | 64 +- ...plex-deinterleaving-reductions-scalable.ll | 26 +- .../complex-deinterleaving-splat-scalable.ll | 16 +- .../AArch64/concat_vector-truncate-combine.ll | 2 +- .../AArch64/dag-combine-concat-vectors.ll | 12 +- .../div-rem-pair-recomposition-signed.ll | 6 +- .../div-rem-pair-recomposition-unsigned.ll | 6 +- llvm/test/CodeGen/AArch64/extbinopload.ll | 121 +- llvm/test/CodeGen/AArch64/fcmp.ll | 108 +- llvm/test/CodeGen/AArch64/fdiv-combine.ll | 2 +- .../fold-int-pow2-with-fmul-or-fdiv.ll | 2 +- .../CodeGen/AArch64/fp-veclib-expansion.ll | 8 +- llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll | 8 +- .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 20 +- llvm/test/CodeGen/AArch64/funnel-shift-rot.ll | 4 +- ...st-and-by-const-from-lshr-in-eqcmp-zero.ll | 2 +- llvm/test/CodeGen/AArch64/icmp.ll | 4 +- llvm/test/CodeGen/AArch64/insert-extend.ll | 54 +- .../insert-subvector-res-legalization.ll | 70 +- .../AArch64/intrinsic-cttz-elts-sve.ll | 91 +- llvm/test/CodeGen/AArch64/itofp.ll | 1132 ++++++------ 
llvm/test/CodeGen/AArch64/ldexp.ll | 6 +- .../CodeGen/AArch64/llvm-ir-to-intrinsic.ll | 43 +- llvm/test/CodeGen/AArch64/load-insert-zero.ll | 28 +- llvm/test/CodeGen/AArch64/logic-shift.ll | 48 +- .../AArch64/named-vector-shuffles-sve.ll | 76 +- llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 758 ++++---- llvm/test/CodeGen/AArch64/neon-extadd.ll | 12 +- llvm/test/CodeGen/AArch64/neon-shift-neg.ll | 24 +- .../CodeGen/AArch64/predicated-add-sub.ll | 22 +- .../AArch64/ragreedy-local-interval-cost.ll | 262 ++- llvm/test/CodeGen/AArch64/rcpc3-sve.ll | 4 +- llvm/test/CodeGen/AArch64/reassocmls.ll | 6 +- llvm/test/CodeGen/AArch64/reduce-shuffle.ll | 356 ++-- llvm/test/CodeGen/AArch64/sat-add.ll | 8 +- llvm/test/CodeGen/AArch64/sext.ll | 36 +- .../CodeGen/AArch64/sink-addsub-of-const.ll | 24 +- ...ate-sm-changing-call-disable-coalescing.ll | 130 +- .../sme-streaming-compatible-interface.ll | 2 +- .../AArch64/sme2-intrinsics-fp-dots.ll | 12 +- .../AArch64/sme2-intrinsics-int-dots.ll | 42 +- .../CodeGen/AArch64/sme2-intrinsics-max.ll | 305 ++- .../CodeGen/AArch64/sme2-intrinsics-min.ll | 163 +- .../CodeGen/AArch64/sme2-intrinsics-mlall.ll | 54 +- .../CodeGen/AArch64/sme2-intrinsics-rshl.ll | 161 +- .../AArch64/sme2-intrinsics-sqdmulh.ll | 89 +- .../CodeGen/AArch64/split-vector-insert.ll | 265 ++- .../CodeGen/AArch64/srem-seteq-vec-splat.ll | 16 +- llvm/test/CodeGen/AArch64/srem-vector-lkk.ll | 4 +- llvm/test/CodeGen/AArch64/sve-abd.ll | 18 +- llvm/test/CodeGen/AArch64/sve-bitcast.ll | 20 +- .../AArch64/sve-calling-convention-mixed.ll | 116 +- llvm/test/CodeGen/AArch64/sve-cmp-folds.ll | 4 +- llvm/test/CodeGen/AArch64/sve-doublereduct.ll | 8 +- llvm/test/CodeGen/AArch64/sve-expand-div.ll | 16 +- .../CodeGen/AArch64/sve-extract-element.ll | 10 +- .../sve-extract-fixed-from-scalable-vector.ll | 8 +- .../AArch64/sve-extract-fixed-vector.ll | 16 +- .../AArch64/sve-extract-scalable-vector.ll | 18 +- llvm/test/CodeGen/AArch64/sve-fcmp.ll | 2 +- llvm/test/CodeGen/AArch64/sve-fcopysign.ll | 18 +- llvm/test/CodeGen/AArch64/sve-fcvt.ll | 64 +- .../sve-fixed-length-addressing-modes.ll | 4 +- .../AArch64/sve-fixed-length-build-vector.ll | 13 +- .../AArch64/sve-fixed-length-concat.ll | 112 +- .../sve-fixed-length-extract-vector-elt.ll | 24 +- .../AArch64/sve-fixed-length-fcopysign.ll | 16 +- .../sve-fixed-length-fp-extend-trunc.ll | 6 +- .../AArch64/sve-fixed-length-fp-select.ll | 234 +-- .../AArch64/sve-fixed-length-fp-to-int.ll | 24 +- .../CodeGen/AArch64/sve-fixed-length-fp128.ll | 8 +- .../sve-fixed-length-frame-offests-crash.ll | 32 +- .../sve-fixed-length-insert-vector-elt.ll | 222 +-- .../AArch64/sve-fixed-length-int-arith.ll | 4 +- .../AArch64/sve-fixed-length-int-div.ll | 56 +- .../AArch64/sve-fixed-length-int-extends.ll | 12 +- .../AArch64/sve-fixed-length-int-rem.ll | 88 +- .../AArch64/sve-fixed-length-int-select.ll | 312 ++-- .../AArch64/sve-fixed-length-int-to-fp.ll | 24 +- .../AArch64/sve-fixed-length-mask-opt.ll | 6 +- .../sve-fixed-length-masked-128bit-loads.ll | 2 +- .../sve-fixed-length-masked-128bit-stores.ll | 18 +- .../AArch64/sve-fixed-length-masked-gather.ll | 148 +- .../AArch64/sve-fixed-length-masked-loads.ll | 32 +- .../sve-fixed-length-masked-scatter.ll | 156 +- .../AArch64/sve-fixed-length-masked-stores.ll | 42 +- .../AArch64/sve-fixed-length-shuffles.ll | 26 +- .../AArch64/sve-fixed-length-splat-vector.ll | 84 +- .../AArch64/sve-fixed-length-trunc-stores.ll | 16 +- .../sve-fixed-length-vector-shuffle-tbl.ll | 6 +- .../CodeGen/AArch64/sve-fp-int-min-max.ll | 2 +- 
.../test/CodeGen/AArch64/sve-fp-reciprocal.ll | 6 +- .../CodeGen/AArch64/sve-fp-reduce-fadda.ll | 4 +- llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll | 282 +-- llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll | 32 +- .../AArch64/sve-gather-scatter-addr-opts.ll | 28 +- llvm/test/CodeGen/AArch64/sve-hadd.ll | 20 +- .../AArch64/sve-implicit-zero-filling.ll | 8 +- .../CodeGen/AArch64/sve-insert-element.ll | 88 +- .../test/CodeGen/AArch64/sve-insert-vector.ll | 22 +- .../test/CodeGen/AArch64/sve-int-arith-imm.ll | 34 +- llvm/test/CodeGen/AArch64/sve-int-arith.ll | 20 +- llvm/test/CodeGen/AArch64/sve-int-reduce.ll | 62 +- .../CodeGen/AArch64/sve-intrinsics-index.ll | 2 +- .../AArch64/sve-intrinsics-int-arith-imm.ll | 38 +- .../AArch64/sve-intrinsics-logical-imm.ll | 2 +- llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll | 30 +- llvm/test/CodeGen/AArch64/sve-ld1r.ll | 16 +- .../sve-lsr-scaled-index-addressing-mode.ll | 4 +- .../AArch64/sve-masked-gather-legalize.ll | 8 +- .../CodeGen/AArch64/sve-masked-ldst-sext.ll | 2 +- .../CodeGen/AArch64/sve-masked-ldst-zext.ll | 6 +- .../AArch64/sve-masked-scatter-legalize.ll | 20 +- .../CodeGen/AArch64/sve-masked-scatter.ll | 2 +- llvm/test/CodeGen/AArch64/sve-pr62151.ll | 2 +- llvm/test/CodeGen/AArch64/sve-pred-arith.ll | 40 +- .../test/CodeGen/AArch64/sve-pred-selectop.ll | 108 +- .../CodeGen/AArch64/sve-pred-selectop2.ll | 76 +- .../CodeGen/AArch64/sve-pred-selectop3.ll | 28 +- .../AArch64/sve-ptest-removal-cmple.ll | 8 +- .../CodeGen/AArch64/sve-redundant-store.ll | 2 +- .../CodeGen/AArch64/sve-split-extract-elt.ll | 42 +- llvm/test/CodeGen/AArch64/sve-split-fcvt.ll | 26 +- .../CodeGen/AArch64/sve-split-fp-reduce.ll | 2 +- .../CodeGen/AArch64/sve-split-insert-elt.ll | 26 +- .../CodeGen/AArch64/sve-split-int-reduce.ll | 8 +- llvm/test/CodeGen/AArch64/sve-split-load.ll | 4 +- llvm/test/CodeGen/AArch64/sve-split-store.ll | 8 +- .../CodeGen/AArch64/sve-srem-combine-loop.ll | 2 +- .../sve-st1-addressing-mode-reg-imm.ll | 8 +- llvm/test/CodeGen/AArch64/sve-stepvector.ll | 2 +- ...treaming-mode-fixed-length-bit-counting.ll | 36 +- ...e-streaming-mode-fixed-length-ext-loads.ll | 10 +- ...e-streaming-mode-fixed-length-fcopysign.ll | 26 +- ...ve-streaming-mode-fixed-length-fp-arith.ll | 48 +- ...streaming-mode-fixed-length-fp-compares.ll | 44 +- ...-streaming-mode-fixed-length-fp-convert.ll | 4 +- ...aming-mode-fixed-length-fp-extend-trunc.ll | 10 +- .../sve-streaming-mode-fixed-length-fp-fma.ll | 6 +- ...e-streaming-mode-fixed-length-fp-minmax.ll | 24 +- ...e-streaming-mode-fixed-length-fp-reduce.ll | 30 +- ...streaming-mode-fixed-length-fp-rounding.ll | 42 +- ...e-streaming-mode-fixed-length-fp-select.ll | 18 +- ...e-streaming-mode-fixed-length-fp-to-int.ll | 118 +- ...-streaming-mode-fixed-length-fp-vselect.ll | 6 +- ...ing-mode-fixed-length-insert-vector-elt.ll | 42 +- ...e-streaming-mode-fixed-length-int-arith.ll | 16 +- ...treaming-mode-fixed-length-int-compares.ll | 14 +- ...sve-streaming-mode-fixed-length-int-div.ll | 34 +- ...eaming-mode-fixed-length-int-immediates.ll | 8 +- ...-streaming-mode-fixed-length-int-minmax.ll | 32 +- ...ve-streaming-mode-fixed-length-int-mulh.ll | 20 +- ...-streaming-mode-fixed-length-int-reduce.ll | 40 +- ...sve-streaming-mode-fixed-length-int-rem.ll | 64 +- ...-streaming-mode-fixed-length-int-select.ll | 28 +- ...-streaming-mode-fixed-length-int-shifts.ll | 32 +- ...e-streaming-mode-fixed-length-int-to-fp.ll | 56 +- ...streaming-mode-fixed-length-int-vselect.ll | 10 +- ...-streaming-mode-fixed-length-log-reduce.ll | 24 +- 
...treaming-mode-fixed-length-masked-store.ll | 16 +- ...eaming-mode-fixed-length-optimize-ptrue.ll | 10 +- ...streaming-mode-fixed-length-permute-rev.ll | 24 +- ...g-mode-fixed-length-permute-zip-uzp-trn.ll | 30 +- .../sve-streaming-mode-fixed-length-ptest.ll | 78 +- .../sve-streaming-mode-fixed-length-rev.ll | 14 +- ...e-streaming-mode-fixed-length-sdiv-pow2.ll | 8 +- ...sve-streaming-mode-fixed-length-shuffle.ll | 2 +- .../sve-streaming-mode-fixed-length-stores.ll | 4 +- .../sve-streaming-mode-fixed-length-trunc.ll | 354 ++-- llvm/test/CodeGen/AArch64/sve-trunc.ll | 36 +- llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll | 20 +- .../sve-uunpklo-load-uzp1-store-combine.ll | 8 +- .../test/CodeGen/AArch64/sve-vecreduce-dot.ll | 2 +- .../CodeGen/AArch64/sve-vecreduce-fold.ll | 2 +- llvm/test/CodeGen/AArch64/sve2-fcopysign.ll | 16 +- .../AArch64/sve2-fixed-length-fcopysign.ll | 16 +- .../AArch64/sve2-intrinsics-combine-rshrnb.ll | 22 +- .../AArch64/sve2-intrinsics-int-arith-imm.ll | 2 +- llvm/test/CodeGen/AArch64/sve2-rsh.ll | 2 +- llvm/test/CodeGen/AArch64/sve2-xar.ll | 2 +- .../AArch64/sve2p1-intrinsics-selx2.ll | 32 +- .../AArch64/sve2p1-intrinsics-selx4.ll | 64 +- .../AArch64/sve2p1-intrinsics-stores.ll | 64 +- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 10 +- .../CodeGen/AArch64/urem-seteq-vec-nonzero.ll | 10 +- llvm/test/CodeGen/AArch64/vec_uaddo.ll | 8 +- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 546 +++--- llvm/test/CodeGen/AArch64/vector-fcopysign.ll | 46 +- llvm/test/CodeGen/AArch64/vector-gep.ll | 4 +- .../test/CodeGen/AArch64/vselect-constants.ll | 5 +- llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 50 +- .../AArch64/Cortex/A510-neon-instructions.s | 376 ++-- .../AArch64/Cortex/A510-sve-instructions.s | 1636 ++++++++--------- 209 files changed, 6488 insertions(+), 6481 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td index 68343674bc819e..94568789461512 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA510.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td @@ -254,7 +254,7 @@ def : InstRW<[WriteIS], (instrs RBITWr, RBITXr)>; // Compute pointer authentication code for data address // Compute pointer authentication code, using generic key // Compute pointer authentication code for instruction address -def : InstRW<[CortexA510Write<3, CortexA510UnitPAC>], (instregex "^AUT", "^PAC")>; +def : InstRW<[CortexA510Write<5, CortexA510UnitPAC>], (instregex "^AUT", "^PAC")>; // Branch and link, register, with pointer authentication // Branch, register, with pointer authentication @@ -401,30 +401,30 @@ def : InstRW<[CortexA510WriteFPALU_F3], (instrs FCSELHrrr, FCSELSrrr, FCSELDrrr) def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(2i32|4i16|8i8)")>; def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(16i8|4i32|8i16)")>; // ASIMD absolute diff accum -def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "[SU]ABAL?v")>; +def : InstRW<[CortexA510Write<6, CortexA510UnitVALU>], (instregex "[SU]ABAL?v")>; // ASIMD absolute diff long def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDLv")>; // ASIMD arith #1 -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v(1i64|2i32|4i16|8i8)", - "[SU]R?HADDv(2i32|4i16|8i8)", "[SU]HSUBv(2i32|4i16|8i8)")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v(2i64|4i32|8i16|16i8)", - "[SU]R?HADDv(8i16|4i32|16i8)", "[SU]HSUBv(8i16|4i32|16i8)")>; +def : 
InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v", + "[SU]R?HADDv", "[SU]HSUBv")>; // ASIMD arith #2 -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$", +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$", "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$", - "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$", "ADDPv(2i32|4i16|8i8)$")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$", +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$", "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$", - "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$", "ADDPv(16i8|2i64|4i32|8i16)$")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$")>; // ASIMD arith #3 -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SADDLv", "UADDLv", "SADDWv", - "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv", "ADDHNv", "SUBHNv")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SADDLv", "UADDLv", "SADDWv", + "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ADDHNv", "SUBHNv")>; // ASIMD arith #5 -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>; +def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>; // ASIMD arith, reduce -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ADDVv", "SADDLVv", "UADDLVv")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ADDVv")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SADDLVv", "UADDLVv")>; // ASIMD compare #1 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>; def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>; @@ -437,8 +437,8 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT| def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v16i8", "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>; // ASIMD max/min, basic -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i132|8i16)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i132|8i16)")>; // SIMD max/min, reduce def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MAX|MIN)Vv")>; // ASIMD multiply, by element @@ -467,12 +467,12 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]MULLv", " // ASIMD polynomial (8x8) multiply long def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULLv8i8, PMULLv16i8)>; // ASIMD pairwise add and accumulate -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]ADALPv")>; +def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex 
"[SU]ADALPv")>; // ASIMD shift accumulate -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>; -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>; // ASIMD shift accumulate #2 -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]RSRA[vd]")>; +def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "[SU]RSRA[vd]")>; // ASIMD shift by immed def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SHLd$", "SHLv", "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>; @@ -504,7 +504,7 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(2i def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]rr$", "^AESI?MCrr")>; // Crypto polynomial (64x64) multiply long -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>; +def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>; // Crypto SHA1 hash acceleration op // Crypto SHA1 schedule acceleration ops @@ -512,25 +512,26 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SHA1(H|SU0|S // Crypto SHA1 hash acceleration ops // Crypto SHA256 hash acceleration ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>; +def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>; // Crypto SHA256 schedule acceleration ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA256SU[01]")>; +def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA256SU[01]")>; // Crypto SHA512 hash acceleration ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>; +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>; // Crypto SHA3 ops -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs BCAX, EOR3, XAR)>; -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs RAX1)>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs BCAX, EOR3)>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs XAR)>; +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs RAX1)>; // Crypto SM3 ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$", +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$")>; // Crypto SM4 ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs SM4E, SM4ENCKEY)>; +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs SM4E, SM4ENCKEY)>; // CRC // ----------------------------------------------------------------------------- @@ -540,25 +541,25 @@ def : InstRW<[CortexA510MCWrite<2, 0, CortexA510UnitMAC>], (instregex "^CRC32")> // SVE Predicate instructions // Loop control, based on predicate -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKA_PPmP, BRKA_PPzP, +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKA_PPmP, BRKA_PPzP, BRKB_PPmP, BRKB_PPzP)>; // Loop control, based on predicate and flag setting -def : InstRW<[CortexA510Write<6, 
CortexA510UnitVALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>; // Loop control, propagating -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>; // Loop control, propagating and flag setting -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKNS_PPzP)>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKNS_PPzP)>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>; // Loop control, based on GPR -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>; // Loop terminate def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instregex "^CTERM(EQ|NE)_(WW|XX)")>; @@ -569,20 +570,20 @@ def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs ADDPL_XXI, ADDVL_X def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instregex "^CNT[BHWD]_XPiI")>; -def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], +def : InstRW<[CortexA510Write<3, CortexA510UnitALU>], (instregex "^(INC|DEC)[BHWD]_XPiI")>; -def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], +def : InstRW<[CortexA510Write<4, CortexA510UnitALU>], (instregex "^(SQINC|SQDEC|UQINC|UQDEC)[BHWD]_[XW]Pi(Wd)?I")>; // Predicate counting scalar, active predicate -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^CNTP_XPP_[BHSD]")>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^(DEC|INC)P_XP_[BHSD]")>; -def : InstRW<[CortexA510Write<8, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<9, CortexA510UnitVALU0>], (instregex "^(SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]", "^(UQDEC|UQINC)P_WP_[BHSD]", "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]")>; @@ -593,39 +594,39 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>; // Predicate logical -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>; // Predicate logical, flag setting -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>; // Predicate reverse -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^REV_PP_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^REV_PP_[BHSD]")>; // Predicate select -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs SEL_PPPP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs SEL_PPPP)>; // Predicate set -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>; // Predicate set/initialize, set flags -def : 
InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PTRUES_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PTRUES_[BHSD]")>; // Predicate find first/next -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>; // Predicate test -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs PTEST_PP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PTEST_PP)>; // Predicate transpose -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>; // Predicate unpack and widen -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>; // Predicate zip/unzip -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>; // SVE integer instructions @@ -634,10 +635,10 @@ def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[1 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABD_(ZPmZ|ZPZZ)_[BHSD]")>; // Arithmetic, absolute diff accum -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>; +def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>; // Arithmetic, absolute diff accum long -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>; +def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>; // Arithmetic, absolute diff long def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>; @@ -651,20 +652,22 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], "^(ADD|SUB|SUBR)_ZI_[BHSD]", "^ADR_[SU]XTW_ZZZ_D_[0123]", "^ADR_LSL_ZZZ_[SD]_[0123]", - "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]", + "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], + (instregex "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]", "^SADDLBT_ZZZ_[HSD]", - "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]", "^SSUBL(BT|TB)_ZZZ_[HSD]")>; // Arithmetic, complex def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], - (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]", - "^SQ(ABS|NEG)_ZPmZ_[BHSD]", + (instregex "^SQ(ABS|NEG)_ZPmZ_[BHSD]", "^SQ(ADD|SUB|SUBR)_ZPmZ_?[BHSD]", "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]", "^[SU]Q(ADD|SUB)_ZI_[BHSD]", "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]", "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>; +def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], + (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]")>; // Arithmetic, large integer def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>; @@ -735,14 +738,14 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BSL|BSL1N|B // Count/reverse bits def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(CLS|CLZ|RBIT)_ZPmZ_[BHSD]")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>; def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_S")>; def : 
InstRW<[CortexA510Write<12, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_D")>; // Broadcast logical bitmask immediate to vector def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs DUPM_ZI)>; // Compare and set flags -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]", "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>; @@ -939,12 +942,14 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZ // Multiply/multiply long, (8x8) polynomial def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^PMUL_ZZZ_B")>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>; +def : InstRW<[CortexA510Write<9, CortexA510UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>; // Predicate counting vector +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], + (instregex "^(DEC|INC)[HWD]_ZPiI")>; def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], - (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>; + (instregex "^(SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>; // Reciprocal estimate def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>; @@ -965,7 +970,7 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MA def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]")>; // Reverse, vector -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^REV_ZZ_[BHSD]", +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^REV_ZZ_[BHSD]", "^REVB_ZPmZ_[HSD]", "^REVH_ZPmZ_[SD]", "^REVW_ZPmZ_D")>; @@ -980,13 +985,13 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBL_ZZZZ?_[B def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBX_ZZZ_[BHSD]")>; // Transpose, vector form -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>; // Unpack and extend def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>; // Zip/unzip -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>; // SVE floating-point instructions // ----------------------------------------------------------------------------- @@ -1142,7 +1147,7 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTMAD_ZZI_[H // Floating point trigonometric, miscellaneous def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTSMUL_ZZZ_[HSD]")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>; // SVE BFloat16 (BF16) instructions @@ -1251,12 +1256,12 @@ def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], "^GLD(FF)?1D(_SCALED)?$")>; // Gather load, 32-bit scaled offset -def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLd>], +def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED$", "^GLD(FF)?1W_[SU]XTW_SCALED")>; // Gather load, 32-bit unpacked unscaled offset -def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$", +def : 
InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$", "^GLD(FF)?1W_[SU]XTW$")>; def : InstRW<[CortexA510Write<0, CortexA510UnitVALU>], (instregex "^PRF(B|H|W|D).*")>; @@ -1377,12 +1382,12 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]_ZZZ_ "^AESI?MC_ZZ_B$")>; // Crypto SHA3 ops -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BCAX|EOR3)_ZZZZ$", +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(BCAX|EOR3)_ZZZZ$", "^XAR_ZZZI_[BHSD]$")>; -def : InstRW<[CortexA510MC_RC0Write<8, CortexA510UnitVMC>], (instregex "^RAX1_ZZZ_D$")>; +def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>], (instregex "^RAX1_ZZZ_D$")>; // Crypto SM4 ops -def : InstRW<[CortexA510MC_RC0Write<8, CortexA510UnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>; +def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll index ceef0c49a45ec7..9a525151ca328b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll @@ -140,10 +140,10 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) { ; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h ; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h ; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] ; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h ; GISEL-NEXT: usra v1.8h, v0.8h, #1 -; GISEL-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] -; GISEL-NEXT: neg v0.8h, v0.8h +; GISEL-NEXT: neg v0.8h, v2.8h ; GISEL-NEXT: ushl v0.8h, v1.8h, v0.8h ; GISEL-NEXT: ret %1 = udiv <8 x i16> %x, @@ -170,13 +170,13 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; GISEL-LABEL: combine_vec_udiv_nonuniform4: ; GISEL: // %bb.0: ; GISEL-NEXT: adrp x8, .LCPI4_2 +; GISEL-NEXT: adrp x9, .LCPI4_0 ; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI4_2] ; GISEL-NEXT: adrp x8, .LCPI4_1 +; GISEL-NEXT: ldr q4, [x9, :lo12:.LCPI4_0] ; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI4_1] -; GISEL-NEXT: adrp x8, .LCPI4_0 ; GISEL-NEXT: umull2 v2.8h, v0.16b, v1.16b ; GISEL-NEXT: umull v1.8h, v0.8b, v1.8b -; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI4_0] ; GISEL-NEXT: uzp2 v1.16b, v1.16b, v2.16b ; GISEL-NEXT: neg v2.16b, v3.16b ; GISEL-NEXT: shl v3.16b, v4.16b, #7 diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index b77d591347659a..ee035ec1941d57 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -101,12 +101,12 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias ; GISEL-NEXT: ushll v2.8h, v2.8b, #0 ; GISEL-NEXT: usubl v3.4s, v1.4h, v2.4h ; GISEL-NEXT: usubl2 v1.4s, v1.8h, v2.8h -; GISEL-NEXT: neg v2.4s, v3.4s -; GISEL-NEXT: neg v4.4s, v1.4s -; GISEL-NEXT: cmgt v5.4s, v0.4s, v3.4s +; GISEL-NEXT: cmgt v2.4s, v0.4s, v3.4s ; GISEL-NEXT: cmgt v0.4s, v0.4s, v1.4s -; GISEL-NEXT: bif v2.16b, v3.16b, v5.16b -; GISEL-NEXT: bsl v0.16b, v4.16b, v1.16b +; GISEL-NEXT: neg v4.4s, v3.4s +; GISEL-NEXT: neg v5.4s, v1.4s +; GISEL-NEXT: bsl v2.16b, v4.16b, v3.16b +; GISEL-NEXT: bsl v0.16b, v5.16b, v1.16b ; GISEL-NEXT: add v0.4s, v2.4s, v0.4s ; GISEL-NEXT: addv s0, v0.4s ; GISEL-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll index fdeae9f326ad83..36b81d8e495ce6 100644 --- 
a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll @@ -4,8 +4,8 @@ define @dupsext_v2i8_v2i16(i8 %src, %b) { ; CHECK-LABEL: dupsext_v2i8_v2i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -20,8 +20,8 @@ entry: define @dupsext_v4i8_v4i16(i8 %src, %b) { ; CHECK-LABEL: dupsext_v4i8_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -36,8 +36,8 @@ entry: define @dupsext_v8i8_v8i16(i8 %src, %b) { ; CHECK-LABEL: dupsext_v8i8_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -52,8 +52,8 @@ entry: define @dupsext_v2i8_v2i32(i8 %src, %b) { ; CHECK-LABEL: dupsext_v2i8_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -68,8 +68,8 @@ entry: define @dupsext_v4i8_v4i32(i8 %src, %b) { ; CHECK-LABEL: dupsext_v4i8_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -84,9 +84,9 @@ entry: define @dupsext_v2i8_v2i64(i8 %src, %b) { ; CHECK-LABEL: dupsext_v2i8_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtb x8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -101,8 +101,8 @@ entry: define @dupsext_v2i16_v2i32(i16 %src, %b) { ; CHECK-LABEL: dupsext_v2i16_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -117,8 +117,8 @@ entry: define @dupsext_v4i16_v4i32(i16 %src, %b) { ; CHECK-LABEL: dupsext_v4i16_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -133,9 +133,9 @@ entry: define @dupsext_v2i16_v2i64(i16 %src, %b) { ; CHECK-LABEL: dupsext_v2i16_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxth x8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -150,9 +150,9 @@ entry: define @dupsext_v2i32_v2i64(i32 %src, %b) { ; CHECK-LABEL: dupsext_v2i32_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -167,8 +167,8 @@ entry: define @dupzext_v2i8_v2i16(i8 %src, %b) { ; CHECK-LABEL: dupzext_v2i8_v2i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -183,8 +183,8 @@ 
entry: define @dupzext_v4i8_v4i16(i8 %src, %b) { ; CHECK-LABEL: dupzext_v4i8_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -199,8 +199,8 @@ entry: define @dupzext_v8i8_v8i16(i8 %src, %b) { ; CHECK-LABEL: dupzext_v8i8_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -215,8 +215,8 @@ entry: define @dupzext_v2i8_v2i32(i8 %src, %b) { ; CHECK-LABEL: dupzext_v2i8_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -231,8 +231,8 @@ entry: define @dupzext_v4i8_v4i32(i8 %src, %b) { ; CHECK-LABEL: dupzext_v4i8_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -247,9 +247,9 @@ entry: define @dupzext_v2i8_v2i64(i8 %src, %b) { ; CHECK-LABEL: dupzext_v2i8_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -264,8 +264,8 @@ entry: define @dupzext_v2i16_v2i32(i16 %src, %b) { ; CHECK-LABEL: dupzext_v2i16_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -280,8 +280,8 @@ entry: define @dupzext_v4i16_v4i32(i16 %src, %b) { ; CHECK-LABEL: dupzext_v4i16_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -296,9 +296,9 @@ entry: define @dupzext_v2i16_v2i64(i16 %src, %b) { ; CHECK-LABEL: dupzext_v2i16_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -313,8 +313,8 @@ entry: define @dupzext_v2i32_v2i64(i32 %src, %b) { ; CHECK-LABEL: dupzext_v2i32_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 61a4f64ac2bfcb..540471a05901ae 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -257,8 +257,8 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldrh w8, [x0] -; CHECK-SVE-NEXT: ptrue p0.d, vl2 ; CHECK-SVE-NEXT: ldrh w9, [x0, #2] +; CHECK-SVE-NEXT: ptrue p0.d, vl2 ; CHECK-SVE-NEXT: ldr d0, [x1] ; CHECK-SVE-NEXT: fmov d1, x8 ; CHECK-SVE-NEXT: sshll v0.2d, v0.2s, #0 diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll 
b/llvm/test/CodeGen/AArch64/active_lane_mask.ll index a65c5d66677946..43122c8c953fc7 100644 --- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll +++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll @@ -131,9 +131,9 @@ define @lane_mask_nxv2i1_i8(i8 %index, i8 %TC) { ; CHECK-NEXT: index z0.d, #0, #1 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0xff +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: and x8, x1, #0xff ; CHECK-NEXT: and z0.d, z0.d, #0xff ; CHECK-NEXT: add z0.d, z0.d, z1.d @@ -153,6 +153,7 @@ define @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill @@ -160,16 +161,16 @@ define @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: index z0.s, #0, #1 ; CHECK-NEXT: mov z1.s, w0 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z25.s, w1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.d, z0.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: uqadd z6.s, z0.s, z1.s ; CHECK-NEXT: incw z0.s, all, mul #4 ; CHECK-NEXT: incw z2.s ; CHECK-NEXT: incw z3.s, all, mul #2 -; CHECK-NEXT: cmphi p2.s, p0/z, z25.s, z6.s ; CHECK-NEXT: uqadd z0.s, z0.s, z1.s +; CHECK-NEXT: cmphi p2.s, p0/z, z25.s, z6.s ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: uqadd z5.s, z2.s, z1.s ; CHECK-NEXT: uqadd z7.s, z3.s, z1.s @@ -177,25 +178,26 @@ define @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) { ; CHECK-NEXT: incw z3.s, all, mul #4 ; CHECK-NEXT: cmphi p5.s, p0/z, z25.s, z0.s ; CHECK-NEXT: incw z4.s, all, mul #2 -; CHECK-NEXT: cmphi p1.s, p0/z, z25.s, z5.s -; CHECK-NEXT: cmphi p3.s, p0/z, z25.s, z7.s ; CHECK-NEXT: uqadd z2.s, z2.s, z1.s ; CHECK-NEXT: uqadd z3.s, z3.s, z1.s +; CHECK-NEXT: cmphi p1.s, p0/z, z25.s, z5.s +; CHECK-NEXT: cmphi p3.s, p0/z, z25.s, z7.s ; CHECK-NEXT: uqadd z24.s, z4.s, z1.s ; CHECK-NEXT: incw z4.s, all, mul #4 -; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h ; CHECK-NEXT: cmphi p6.s, p0/z, z25.s, z2.s -; CHECK-NEXT: cmphi p2.s, p0/z, z25.s, z3.s +; CHECK-NEXT: cmphi p7.s, p0/z, z25.s, z3.s +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h ; CHECK-NEXT: uqadd z1.s, z4.s, z1.s ; CHECK-NEXT: cmphi p4.s, p0/z, z25.s, z24.s -; CHECK-NEXT: uzp1 p3.h, p3.h, p4.h ; CHECK-NEXT: cmphi p0.s, p0/z, z25.s, z1.s -; CHECK-NEXT: uzp1 p4.h, p5.h, p6.h +; CHECK-NEXT: uzp1 p2.h, p3.h, p4.h +; CHECK-NEXT: uzp1 p3.h, p5.h, p6.h ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p2.h, p2.h, p0.h -; CHECK-NEXT: uzp1 p0.b, p1.b, p3.b -; CHECK-NEXT: uzp1 p1.b, p4.b, p2.b +; CHECK-NEXT: uzp1 p4.h, p7.h, p0.h +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.b, p1.b, p2.b +; CHECK-NEXT: uzp1 p1.b, p3.b, p4.b ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -208,96 +210,97 @@ define @lane_mask_nxv32i1_i64(i64 %index, i64 %TC) { ; CHECK-LABEL: lane_mask_nxv32i1_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: index z5.d, #0, #1 ; CHECK-NEXT: mov z0.d, x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z3.d, x1 -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: uqadd z25.d, z1.d, z0.d -; CHECK-NEXT: incd z1.d, all, mul #8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z2.d, z5.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: mov z4.d, z5.d +; CHECK-NEXT: uqadd z25.d, z5.d, z0.d +; CHECK-NEXT: incd z5.d, all, mul #8 ; CHECK-NEXT: incd z2.d -; CHECK-NEXT: incd z4.d, all, mul #2 -; CHECK-NEXT: incd z6.d, all, mul #4 -; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z25.d -; CHECK-NEXT: uqadd z1.d, z1.d, z0.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: uqadd z26.d, z2.d, z0.d +; CHECK-NEXT: incd z1.d, all, mul #2 +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: uqadd z5.d, z5.d, z0.d +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z25.d +; CHECK-NEXT: mov z6.d, z2.d ; CHECK-NEXT: mov z7.d, z2.d -; CHECK-NEXT: mov z24.d, z4.d -; CHECK-NEXT: uqadd z27.d, z4.d, z0.d -; CHECK-NEXT: uqadd z28.d, z6.d, z0.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: uqadd z26.d, z2.d, z0.d +; CHECK-NEXT: uqadd z27.d, z1.d, z0.d +; CHECK-NEXT: uqadd z28.d, z4.d, z0.d ; CHECK-NEXT: incd z2.d, all, mul #8 +; CHECK-NEXT: incd z1.d, all, mul #8 ; CHECK-NEXT: incd z4.d, all, mul #8 -; CHECK-NEXT: incd z6.d, all, mul #8 -; CHECK-NEXT: incd z5.d, all, mul #2 +; CHECK-NEXT: incd z6.d, all, mul #2 ; CHECK-NEXT: incd z7.d, all, mul #4 -; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z26.d ; CHECK-NEXT: incd z24.d, all, mul #4 -; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z27.d -; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z28.d +; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z26.d +; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z27.d +; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z28.d +; CHECK-NEXT: mov z31.d, z6.d +; CHECK-NEXT: uqadd z29.d, z6.d, z0.d +; CHECK-NEXT: uqadd z30.d, z7.d, z0.d +; CHECK-NEXT: uqadd z8.d, z24.d, z0.d +; CHECK-NEXT: incd z6.d, all, mul #8 +; CHECK-NEXT: incd z7.d, all, mul #8 +; CHECK-NEXT: incd z24.d, all, mul #8 ; CHECK-NEXT: uqadd z2.d, z2.d, z0.d +; CHECK-NEXT: uqadd z1.d, z1.d, z0.d +; CHECK-NEXT: incd z31.d, all, mul #4 ; CHECK-NEXT: uqadd z4.d, z4.d, z0.d +; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s +; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z29.d +; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z30.d ; CHECK-NEXT: uqadd z6.d, z6.d, z0.d -; CHECK-NEXT: mov z26.d, z5.d -; CHECK-NEXT: uqadd z25.d, z5.d, z0.d -; CHECK-NEXT: uqadd z27.d, z7.d, z0.d -; CHECK-NEXT: incd z5.d, all, mul #8 -; 
CHECK-NEXT: incd z7.d, all, mul #8 -; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s -; CHECK-NEXT: incd z26.d, all, mul #4 -; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z2.d -; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z25.d -; CHECK-NEXT: uqadd z25.d, z24.d, z0.d -; CHECK-NEXT: incd z24.d, all, mul #8 -; CHECK-NEXT: uqadd z5.d, z5.d, z0.d +; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: uqadd z7.d, z7.d, z0.d -; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z27.d -; CHECK-NEXT: uqadd z28.d, z26.d, z0.d -; CHECK-NEXT: incd z26.d, all, mul #8 -; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s +; CHECK-NEXT: uqadd z25.d, z31.d, z0.d +; CHECK-NEXT: incd z31.d, all, mul #8 ; CHECK-NEXT: uqadd z24.d, z24.d, z0.d -; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z25.d -; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z1.d -; CHECK-NEXT: uzp1 p5.s, p5.s, p6.s -; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z4.d -; CHECK-NEXT: cmphi p9.d, p0/z, z3.d, z5.d -; CHECK-NEXT: cmphi p10.d, p0/z, z3.d, z7.d -; CHECK-NEXT: uqadd z0.d, z26.d, z0.d -; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z28.d -; CHECK-NEXT: uzp1 p4.s, p4.s, p8.s -; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z24.d -; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s +; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z5.d +; CHECK-NEXT: uzp1 p2.s, p2.s, p5.s +; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z2.d +; CHECK-NEXT: cmphi p9.d, p0/z, z3.d, z6.d +; CHECK-NEXT: uqadd z0.d, z31.d, z0.d +; CHECK-NEXT: uzp1 p1.s, p1.s, p7.s +; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z1.d +; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z25.d +; CHECK-NEXT: uzp1 p2.h, p3.h, p2.h +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z7.d +; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s +; CHECK-NEXT: uzp1 p5.s, p7.s, p9.s ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h -; CHECK-NEXT: uzp1 p2.s, p7.s, p2.s -; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z6.d +; CHECK-NEXT: uzp1 p6.s, p6.s, p8.s +; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z4.d +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p4.h, p4.h, p5.h +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.h, p1.h, p6.h +; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z24.d ; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z0.d -; CHECK-NEXT: uzp1 p7.s, p7.s, p10.s -; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.s, p8.s, p0.s +; CHECK-NEXT: uzp1 p3.s, p8.s, p3.s ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p3.h, p4.h, p6.h +; CHECK-NEXT: uzp1 p0.s, p6.s, p0.s ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p2.h, p5.h, p2.h -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p4.h, p7.h, p0.h -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.b, p1.b, p2.b -; CHECK-NEXT: uzp1 p1.b, p3.b, p4.b +; CHECK-NEXT: uzp1 p3.h, p3.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p2.b, p1.b +; CHECK-NEXT: uzp1 p1.b, p4.b, p3.b ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %active.lane.mask = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %TC) @@ -459,12 +462,12 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) { ; CHECK-NEXT: adrp x8, .LCPI26_0 ; CHECK-NEXT: movi d2, #0xff00ff00ff00ff ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI26_0] +; CHECK-NEXT: dup v3.4h, w1 ; CHECK-NEXT: bic v0.4h, #255, 
lsl #8 +; CHECK-NEXT: bic v3.4h, #255, lsl #8 ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: dup v1.4h, w1 ; CHECK-NEXT: umin v0.4h, v0.4h, v2.4h -; CHECK-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-NEXT: cmhi v0.4h, v1.4h, v0.4h +; CHECK-NEXT: cmhi v0.4h, v3.4h, v0.4h ; CHECK-NEXT: ret %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i8(i8 %index, i8 %TC) ret <4 x i1> %active.lane.mask @@ -480,9 +483,9 @@ define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) { ; CHECK-NEXT: dup v3.2s, w1 ; CHECK-NEXT: and v1.8b, v1.8b, v0.8b ; CHECK-NEXT: add v1.2s, v1.2s, v2.2s -; CHECK-NEXT: umin v1.2s, v1.2s, v0.2s -; CHECK-NEXT: and v0.8b, v3.8b, v0.8b -; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s +; CHECK-NEXT: and v2.8b, v3.8b, v0.8b +; CHECK-NEXT: umin v0.2s, v1.2s, v0.2s +; CHECK-NEXT: cmhi v0.2s, v2.2s, v0.2s ; CHECK-NEXT: ret %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8 %index, i8 %TC) ret <2 x i1> %active.lane.mask diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll index 3007e7ce771e62..508f68d6f14d43 100644 --- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll +++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll @@ -54,19 +54,19 @@ define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) { ; CHECK-LABEL: uitofp_v4i64_to_v4bf16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret %tmp1 = load <4 x i64>, ptr %ptr diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll index fe4da2e7cf36b5..5b45ba2552cefd 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -257,12 +257,12 @@ define i16 @uabd16b_rdx(ptr %a, ptr %b) { ; CHECK-GI-NEXT: movi.2d v0, #0000000000000000 ; CHECK-GI-NEXT: usubl.8h v3, v1, v2 ; CHECK-GI-NEXT: usubl2.8h v1, v1, v2 -; CHECK-GI-NEXT: neg.8h v2, v3 -; CHECK-GI-NEXT: neg.8h v4, v1 -; CHECK-GI-NEXT: cmgt.8h v5, v0, v3 +; CHECK-GI-NEXT: cmgt.8h v2, v0, v3 ; CHECK-GI-NEXT: cmgt.8h v0, v0, v1 -; CHECK-GI-NEXT: bif.16b v2, v3, v5 -; CHECK-GI-NEXT: bsl.16b v0, v4, v1 +; CHECK-GI-NEXT: neg.8h v4, v3 +; CHECK-GI-NEXT: neg.8h v5, v1 +; CHECK-GI-NEXT: bsl.16b v2, v4, v3 +; CHECK-GI-NEXT: bsl.16b v0, v5, v1 ; CHECK-GI-NEXT: add.8h v0, v2, v0 ; CHECK-GI-NEXT: addv.8h h0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -299,18 +299,18 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) { ; CHECK-GI-NEXT: usubl2.4s v3, v3, v4 ; CHECK-GI-NEXT: usubl.4s v4, v0, v1 ; CHECK-GI-NEXT: usubl2.4s v0, v0, v1 -; CHECK-GI-NEXT: neg.4s v6, v5 -; CHECK-GI-NEXT: neg.4s v7, v3 ; CHECK-GI-NEXT: cmgt.4s v1, v2, v5 -; CHECK-GI-NEXT: neg.4s v16, v4 -; CHECK-GI-NEXT: neg.4s v17, v0 -; CHECK-GI-NEXT: cmgt.4s v18, v2, v3 -; CHECK-GI-NEXT: 
cmgt.4s v19, v2, v4 +; CHECK-GI-NEXT: cmgt.4s v6, v2, v3 +; CHECK-GI-NEXT: neg.4s v16, v5 +; CHECK-GI-NEXT: cmgt.4s v7, v2, v4 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0 -; CHECK-GI-NEXT: bsl.16b v1, v6, v5 -; CHECK-GI-NEXT: bit.16b v3, v7, v18 -; CHECK-GI-NEXT: bit.16b v4, v16, v19 -; CHECK-GI-NEXT: bit.16b v0, v17, v2 +; CHECK-GI-NEXT: neg.4s v17, v3 +; CHECK-GI-NEXT: neg.4s v18, v4 +; CHECK-GI-NEXT: neg.4s v19, v0 +; CHECK-GI-NEXT: bsl.16b v1, v16, v5 +; CHECK-GI-NEXT: bit.16b v3, v17, v6 +; CHECK-GI-NEXT: bit.16b v4, v18, v7 +; CHECK-GI-NEXT: bit.16b v0, v19, v2 ; CHECK-GI-NEXT: add.4s v1, v1, v3 ; CHECK-GI-NEXT: add.4s v0, v4, v0 ; CHECK-GI-NEXT: add.4s v0, v1, v0 @@ -347,18 +347,18 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) { ; CHECK-GI-NEXT: ssubl2.4s v3, v3, v4 ; CHECK-GI-NEXT: ssubl.4s v4, v0, v1 ; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1 -; CHECK-GI-NEXT: neg.4s v6, v5 -; CHECK-GI-NEXT: neg.4s v7, v3 ; CHECK-GI-NEXT: cmgt.4s v1, v2, v5 -; CHECK-GI-NEXT: neg.4s v16, v4 -; CHECK-GI-NEXT: neg.4s v17, v0 -; CHECK-GI-NEXT: cmgt.4s v18, v2, v3 -; CHECK-GI-NEXT: cmgt.4s v19, v2, v4 +; CHECK-GI-NEXT: cmgt.4s v6, v2, v3 +; CHECK-GI-NEXT: neg.4s v16, v5 +; CHECK-GI-NEXT: cmgt.4s v7, v2, v4 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0 -; CHECK-GI-NEXT: bsl.16b v1, v6, v5 -; CHECK-GI-NEXT: bit.16b v3, v7, v18 -; CHECK-GI-NEXT: bit.16b v4, v16, v19 -; CHECK-GI-NEXT: bit.16b v0, v17, v2 +; CHECK-GI-NEXT: neg.4s v17, v3 +; CHECK-GI-NEXT: neg.4s v18, v4 +; CHECK-GI-NEXT: neg.4s v19, v0 +; CHECK-GI-NEXT: bsl.16b v1, v16, v5 +; CHECK-GI-NEXT: bit.16b v3, v17, v6 +; CHECK-GI-NEXT: bit.16b v4, v18, v7 +; CHECK-GI-NEXT: bit.16b v0, v19, v2 ; CHECK-GI-NEXT: add.4s v1, v1, v3 ; CHECK-GI-NEXT: add.4s v0, v4, v0 ; CHECK-GI-NEXT: add.4s v0, v1, v0 @@ -396,12 +396,12 @@ define i32 @uabd8h_rdx(ptr %a, ptr %b) { ; CHECK-GI-NEXT: movi.2d v0, #0000000000000000 ; CHECK-GI-NEXT: usubl.4s v3, v1, v2 ; CHECK-GI-NEXT: usubl2.4s v1, v1, v2 -; CHECK-GI-NEXT: neg.4s v2, v3 -; CHECK-GI-NEXT: neg.4s v4, v1 -; CHECK-GI-NEXT: cmgt.4s v5, v0, v3 +; CHECK-GI-NEXT: cmgt.4s v2, v0, v3 ; CHECK-GI-NEXT: cmgt.4s v0, v0, v1 -; CHECK-GI-NEXT: bif.16b v2, v3, v5 -; CHECK-GI-NEXT: bsl.16b v0, v4, v1 +; CHECK-GI-NEXT: neg.4s v4, v3 +; CHECK-GI-NEXT: neg.4s v5, v1 +; CHECK-GI-NEXT: bsl.16b v2, v4, v3 +; CHECK-GI-NEXT: bsl.16b v0, v5, v1 ; CHECK-GI-NEXT: add.4s v0, v2, v0 ; CHECK-GI-NEXT: addv.4s s0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -428,15 +428,15 @@ define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) { ; ; CHECK-GI-LABEL: sabd8h_rdx: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: ssubl.4s v3, v0, v1 ; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 -; CHECK-GI-NEXT: neg.4s v1, v3 -; CHECK-GI-NEXT: neg.4s v4, v0 -; CHECK-GI-NEXT: cmgt.4s v5, v2, v3 +; CHECK-GI-NEXT: neg.4s v4, v3 +; CHECK-GI-NEXT: neg.4s v5, v0 +; CHECK-GI-NEXT: cmgt.4s v1, v2, v3 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0 -; CHECK-GI-NEXT: bif.16b v1, v3, v5 -; CHECK-GI-NEXT: bit.16b v0, v4, v2 +; CHECK-GI-NEXT: bsl.16b v1, v4, v3 +; CHECK-GI-NEXT: bit.16b v0, v5, v2 ; CHECK-GI-NEXT: add.4s v0, v1, v0 ; CHECK-GI-NEXT: addv.4s s0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -461,10 +461,10 @@ define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) { ; ; CHECK-GI-LABEL: uabdl4s_rdx_i32: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: usubl.4s v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v1, #0000000000000000 +; CHECK-GI-NEXT: cmgt.4s v1, v2, v0 ; CHECK-GI-NEXT: neg.4s v2, v0 -; 
CHECK-GI-NEXT: cmgt.4s v1, v1, v0 ; CHECK-GI-NEXT: bit.16b v0, v2, v1 ; CHECK-GI-NEXT: addv.4s s0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -499,12 +499,12 @@ define i64 @uabd4s_rdx(ptr %a, ptr %b, i32 %h) { ; CHECK-GI-NEXT: movi.2d v0, #0000000000000000 ; CHECK-GI-NEXT: usubl.2d v3, v1, v2 ; CHECK-GI-NEXT: usubl2.2d v1, v1, v2 -; CHECK-GI-NEXT: neg.2d v2, v3 -; CHECK-GI-NEXT: neg.2d v4, v1 -; CHECK-GI-NEXT: cmgt.2d v5, v0, v3 +; CHECK-GI-NEXT: cmgt.2d v2, v0, v3 ; CHECK-GI-NEXT: cmgt.2d v0, v0, v1 -; CHECK-GI-NEXT: bif.16b v2, v3, v5 -; CHECK-GI-NEXT: bsl.16b v0, v4, v1 +; CHECK-GI-NEXT: neg.2d v4, v3 +; CHECK-GI-NEXT: neg.2d v5, v1 +; CHECK-GI-NEXT: bsl.16b v2, v4, v3 +; CHECK-GI-NEXT: bsl.16b v0, v5, v1 ; CHECK-GI-NEXT: add.2d v0, v2, v0 ; CHECK-GI-NEXT: addp.2d d0, v0 ; CHECK-GI-NEXT: fmov x0, d0 @@ -531,15 +531,15 @@ define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) { ; ; CHECK-GI-LABEL: sabd4s_rdx: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: ssubl.2d v3, v0, v1 ; CHECK-GI-NEXT: ssubl2.2d v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 -; CHECK-GI-NEXT: neg.2d v1, v3 -; CHECK-GI-NEXT: neg.2d v4, v0 -; CHECK-GI-NEXT: cmgt.2d v5, v2, v3 +; CHECK-GI-NEXT: neg.2d v4, v3 +; CHECK-GI-NEXT: neg.2d v5, v0 +; CHECK-GI-NEXT: cmgt.2d v1, v2, v3 ; CHECK-GI-NEXT: cmgt.2d v2, v2, v0 -; CHECK-GI-NEXT: bif.16b v1, v3, v5 -; CHECK-GI-NEXT: bit.16b v0, v4, v2 +; CHECK-GI-NEXT: bsl.16b v1, v4, v3 +; CHECK-GI-NEXT: bit.16b v0, v5, v2 ; CHECK-GI-NEXT: add.2d v0, v1, v0 ; CHECK-GI-NEXT: addp.2d d0, v0 ; CHECK-GI-NEXT: fmov x0, d0 @@ -564,10 +564,10 @@ define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) { ; ; CHECK-GI-LABEL: uabdl2d_rdx_i64: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: usubl.2d v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v1, #0000000000000000 +; CHECK-GI-NEXT: cmgt.2d v1, v2, v0 ; CHECK-GI-NEXT: neg.2d v2, v0 -; CHECK-GI-NEXT: cmgt.2d v1, v1, v0 ; CHECK-GI-NEXT: bit.16b v0, v2, v1 ; CHECK-GI-NEXT: addp.2d d0, v0 ; CHECK-GI-NEXT: fmov x0, d0 @@ -1796,10 +1796,10 @@ define <2 x i64> @uabd_i32(<2 x i32> %a, <2 x i32> %b) { ; ; CHECK-GI-LABEL: uabd_i32: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: ssubl.2d v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v1, #0000000000000000 +; CHECK-GI-NEXT: cmgt.2d v1, v2, v0 ; CHECK-GI-NEXT: neg.2d v2, v0 -; CHECK-GI-NEXT: cmgt.2d v1, v1, v0 ; CHECK-GI-NEXT: bit.16b v0, v2, v1 ; CHECK-GI-NEXT: ret %aext = sext <2 x i32> %a to <2 x i64> diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll index cafee32ada6868..d4cc154ac6afc0 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -205,15 +205,15 @@ define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp { ; GENERIC-LABEL: test_vcvt_bf16_f64: ; GENERIC: // %bb.0: ; GENERIC-NEXT: fcvtxn v0.2s, v0.2d -; GENERIC-NEXT: movi.4s v1, #127, msl #8 -; GENERIC-NEXT: movi.4s v2, #1 +; GENERIC-NEXT: movi.4s v1, #1 +; GENERIC-NEXT: movi.4s v2, #127, msl #8 ; GENERIC-NEXT: ushr.4s v3, v0, #16 -; GENERIC-NEXT: add.4s v1, v0, v1 -; GENERIC-NEXT: and.16b v2, v3, v2 -; GENERIC-NEXT: add.4s v1, v2, v1 -; GENERIC-NEXT: fcmeq.4s v2, v0, v0 +; GENERIC-NEXT: add.4s v2, v0, v2 +; GENERIC-NEXT: and.16b v1, v3, v1 +; GENERIC-NEXT: fcmeq.4s v3, v0, v0 ; GENERIC-NEXT: orr.4s v0, #64, lsl #16 -; GENERIC-NEXT: bit.16b v0, v1, v2 +; GENERIC-NEXT: add.4s v1, v1, v2 +; GENERIC-NEXT: bit.16b v0, v1, v3 ; 
GENERIC-NEXT: shrn.4h v0, v0, #16 ; GENERIC-NEXT: ret ; @@ -238,15 +238,15 @@ define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp { ; GISEL-LABEL: test_vcvt_bf16_f64: ; GISEL: // %bb.0: ; GISEL-NEXT: fcvtxn v0.2s, v0.2d -; GISEL-NEXT: movi.4s v1, #127, msl #8 -; GISEL-NEXT: movi.4s v2, #1 +; GISEL-NEXT: movi.4s v1, #1 +; GISEL-NEXT: movi.4s v2, #127, msl #8 ; GISEL-NEXT: ushr.4s v3, v0, #16 -; GISEL-NEXT: add.4s v1, v0, v1 -; GISEL-NEXT: and.16b v2, v3, v2 -; GISEL-NEXT: add.4s v1, v2, v1 -; GISEL-NEXT: fcmeq.4s v2, v0, v0 +; GISEL-NEXT: add.4s v2, v0, v2 +; GISEL-NEXT: and.16b v1, v3, v1 +; GISEL-NEXT: fcmeq.4s v3, v0, v0 ; GISEL-NEXT: orr.4s v0, #64, lsl #16 -; GISEL-NEXT: bit.16b v0, v1, v2 +; GISEL-NEXT: add.4s v1, v1, v2 +; GISEL-NEXT: bit.16b v0, v1, v3 ; GISEL-NEXT: shrn.4h v0, v0, #16 ; GISEL-NEXT: ret %vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat> diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll index dda610e5dd3cb1..e754f01daa2a9b 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll @@ -903,10 +903,10 @@ define <2 x i16> @hadd8x2_sext_lsr(<2 x i8> %src1, <2 x i8> %src2) { ; CHECK: // %bb.0: ; CHECK-NEXT: shl.2s v0, v0, #24 ; CHECK-NEXT: shl.2s v1, v1, #24 +; CHECK-NEXT: movi d2, #0x00ffff0000ffff ; CHECK-NEXT: sshr.2s v0, v0, #24 ; CHECK-NEXT: ssra.2s v0, v1, #24 -; CHECK-NEXT: movi d1, #0x00ffff0000ffff -; CHECK-NEXT: and.8b v0, v0, v1 +; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: ushr.2s v0, v0, #1 ; CHECK-NEXT: ret %zextsrc1 = sext <2 x i8> %src1 to <2 x i16> @@ -968,10 +968,10 @@ define <4 x i16> @rhadd8_sext_lsr(<4 x i8> %src1, <4 x i8> %src2) { ; CHECK: // %bb.0: ; CHECK-NEXT: shl.4h v0, v0, #8 ; CHECK-NEXT: shl.4h v1, v1, #8 +; CHECK-NEXT: movi.4h v2, #1 ; CHECK-NEXT: sshr.4h v0, v0, #8 ; CHECK-NEXT: ssra.4h v0, v1, #8 -; CHECK-NEXT: movi.4h v1, #1 -; CHECK-NEXT: add.4h v0, v0, v1 +; CHECK-NEXT: add.4h v0, v0, v2 ; CHECK-NEXT: ushr.4h v0, v0, #1 ; CHECK-NEXT: ret %zextsrc1 = sext <4 x i8> %src1 to <4 x i16> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll index ebf5ce20d4ecc4..86b1d5d195ffd8 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll @@ -7,21 +7,22 @@ target triple = "aarch64-unknown-linux-gnu" define @mull_add( %a, %b, %c) { ; CHECK-LABEL: mull_add: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 z6.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z7.d, z2.d, z3.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z6.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z2.d, z3.d +; CHECK-NEXT: uzp1 z2.d, z2.d, z3.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d -; CHECK-NEXT: fmul z2.d, z6.d, z7.d -; CHECK-NEXT: fmul z3.d, z0.d, z7.d -; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: fnmsb z1.d, p0/m, z6.d, z3.d -; CHECK-NEXT: uzp2 z2.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z3.d, z4.d, z5.d -; CHECK-NEXT: fadd z2.d, z0.d, z2.d +; CHECK-NEXT: fmul z7.d, z0.d, z1.d +; CHECK-NEXT: fmul z1.d, z6.d, z1.d +; CHECK-NEXT: movprfx z3, z7 +; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z2.d +; CHECK-NEXT: fnmsb z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z2.d, z4.d, z5.d +; CHECK-NEXT: fadd z2.d, z2.d, z0.d ; CHECK-NEXT: fadd z1.d, z3.d, z1.d -; 
CHECK-NEXT: zip1 z0.d, z1.d, z2.d -; CHECK-NEXT: zip2 z1.d, z1.d, z2.d +; CHECK-NEXT: zip1 z0.d, z2.d, z1.d +; CHECK-NEXT: zip2 z1.d, z2.d, z1.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -49,21 +50,21 @@ entry: define @mul_add_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 -; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: fadd z1.d, z26.d, z24.d +; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -100,21 +101,21 @@ entry: define @mul_sub_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 -; CHECK-NEXT: fsub z0.d, z25.d, z27.d ; CHECK-NEXT: fsub z1.d, z26.d, z24.d +; CHECK-NEXT: fsub z0.d, z25.d, z27.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -151,21 +152,21 @@ entry: define @mul_conj_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #270 -; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 -; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: fadd z1.d, z26.d, z24.d +; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -206,8 +207,8 @@ define @mul_add_rot_mull( %a, @mul_add_rot_mull( %a, @mul_add_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; 
CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #90 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 ; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -90,19 +90,19 @@ entry: define @mul_sub_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #270 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #270 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #270 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #180 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #180 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #180 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 ; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -139,19 +139,19 @@ entry: define @mul_conj_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 -; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 ; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #270 +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 -; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #270 ; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -188,24 +188,25 @@ entry: define @mul_add_rot_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_rot_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 z24.d, z2.d, z3.d +; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d ; CHECK-NEXT: uzp2 z25.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp1 z2.d, z2.d, z3.d ; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z4.d, z5.d ; CHECK-NEXT: uzp1 z26.d, z6.d, z7.d -; CHECK-NEXT: fmul z1.d, z24.d, z25.d -; CHECK-NEXT: fmul z3.d, z2.d, z25.d -; CHECK-NEXT: uzp2 z25.d, z4.d, z5.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uzp1 z4.d, 
z4.d, z5.d ; CHECK-NEXT: uzp2 z5.d, z6.d, z7.d -; CHECK-NEXT: fmla z3.d, p0/m, z26.d, z25.d -; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z0.d -; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: fmla z2.d, p0/m, z5.d, z4.d -; CHECK-NEXT: fmla z1.d, p0/m, z26.d, z4.d -; CHECK-NEXT: fnmls z2.d, p0/m, z24.d, z0.d -; CHECK-NEXT: fmls z1.d, p0/m, z5.d, z25.d +; CHECK-NEXT: fmul z3.d, z2.d, z25.d +; CHECK-NEXT: fmul z25.d, z24.d, z25.d +; CHECK-NEXT: fmla z3.d, p0/m, z24.d, z0.d +; CHECK-NEXT: movprfx z24, z25 +; CHECK-NEXT: fmla z24.d, p0/m, z26.d, z1.d +; CHECK-NEXT: movprfx z6, z24 +; CHECK-NEXT: fmla z6.d, p0/m, z5.d, z4.d +; CHECK-NEXT: fmla z3.d, p0/m, z26.d, z4.d +; CHECK-NEXT: fnmsb z2.d, p0/m, z0.d, z6.d +; CHECK-NEXT: fmsb z1.d, p0/m, z5.d, z3.d ; CHECK-NEXT: zip1 z0.d, z2.d, z1.d ; CHECK-NEXT: zip2 z1.d, z2.d, z1.d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll index 611cf44ea7ee87..cb285c05b2e86f 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll @@ -16,9 +16,10 @@ define @complex_mul_v4f16( %a, @complex_mul_v8f16( %a, %b) { ; CHECK-LABEL: complex_mul_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #0 ; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -72,15 +73,15 @@ entry: define @complex_mul_v16f16( %a, %b) { ; CHECK-LABEL: complex_mul_v16f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z4.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z5.d, z4.d -; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #0 ; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #0 -; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #90 +; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #0 ; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #90 -; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #90 ; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f16( %a) @@ -103,23 +104,23 @@ entry: define @complex_mul_v32f16( %a, %b) { ; CHECK-LABEL: complex_mul_v32f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z24.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #0 ; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #0 ; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #0 ; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #0 -; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #0 +; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #90 ; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #90 ; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #90 ; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #90 -; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #90 +; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: mov z1.d, z26.d ; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32f16( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll index 0f5e9a2202ddd4..1e2afb78de1b0b 100644 --- 
a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll @@ -7,8 +7,8 @@ target triple = "aarch64" define @complex_mul_v4f32( %a, %b) { ; CHECK-LABEL: complex_mul_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #0 ; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -34,15 +34,15 @@ entry: define @complex_mul_v8f32( %a, %b) { ; CHECK-LABEL: complex_mul_v8f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z4.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z5.d, z4.d -; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #0 ; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #0 -; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #90 +; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #0 ; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #90 -; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #90 ; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( %a) @@ -65,23 +65,23 @@ entry: define @complex_mul_v16f32( %a, %b) { ; CHECK-LABEL: complex_mul_v16f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z24.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #0 ; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #0 ; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #0 ; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #0 -; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #0 +; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #90 ; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #90 ; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #90 ; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #90 -; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #90 +; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: mov z1.d, z26.d ; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f32( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll index 1fe554bdc616e6..17a239a09a0339 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll @@ -7,8 +7,8 @@ target triple = "aarch64" define @complex_mul_v2f64( %a, %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #0 ; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -34,15 +34,15 @@ entry: define @complex_mul_v4f64( %a, %b) { ; CHECK-LABEL: complex_mul_v4f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z5.d, z4.d -; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z4.d, p0/m, z3.d, z1.d, #0 -; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #90 +; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z4.d, p0/m, z3.d, z1.d, #90 -; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, 
z0.d, #90 ; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -65,23 +65,23 @@ entry: define @complex_mul_v8f64( %a, %b) { ; CHECK-LABEL: complex_mul_v8f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z2.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #90 ; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z2.d, #90 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #90 +; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: mov z1.d, z26.d ; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f64( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll index 1b8a21b66ade98..07488b623b98db 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll @@ -16,8 +16,9 @@ define @complex_mul_v4i16( %a, This Inner Loop Header: Depth=1 -; CHECK-NEXT: zip2 p3.d, p1.d, p1.d +; CHECK-NEXT: zip2 p2.d, p1.d, p1.d ; CHECK-NEXT: add x13, x0, x8 ; CHECK-NEXT: add x14, x1, x8 -; CHECK-NEXT: zip1 p2.d, p1.d, p1.d +; CHECK-NEXT: zip1 p1.d, p1.d, p1.d ; CHECK-NEXT: mov z6.d, z1.d ; CHECK-NEXT: mov z7.d, z0.d -; CHECK-NEXT: whilelo p1.d, x12, x9 +; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] ; CHECK-NEXT: add x8, x8, x11 -; CHECK-NEXT: add x12, x12, x10 -; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] -; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] +; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13] +; CHECK-NEXT: ld1d { z5.d }, p1/z, [x14] ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p3/m, z7.d -; CHECK-NEXT: mov z1.d, p2/m, z6.d +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: whilelo p1.d, x12, x9 +; CHECK-NEXT: add x12, x12, x10 ; CHECK-NEXT: b.mi .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -114,10 +114,10 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr % ; CHECK-LABEL: complex_mul_predicated_v2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x10 -; CHECK-NEXT: neg x11, x10 ; CHECK-NEXT: mov w12, #100 // =0x64 +; CHECK-NEXT: neg x11, x10 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: and x11, x11, x12 @@ -133,20 +133,20 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr % ; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: add x9, x9, x10 
; CHECK-NEXT: add x8, x8, x12 -; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0 +; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0 ; CHECK-NEXT: cmp x11, x9 -; CHECK-NEXT: zip2 p2.d, p1.d, p1.d -; CHECK-NEXT: zip1 p1.d, p1.d, p1.d -; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13] -; CHECK-NEXT: ld1d { z5.d }, p1/z, [x14] +; CHECK-NEXT: zip2 p1.d, p2.d, p2.d +; CHECK-NEXT: zip1 p2.d, p2.d, p2.d +; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] +; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: mov z0.d, p1/m, z7.d +; CHECK-NEXT: mov z1.d, p2/m, z6.d ; CHECK-NEXT: b.ne .LBB1_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -217,8 +217,8 @@ exit.block: ; preds = %vector.body define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, ptr %cond) { ; CHECK-LABEL: complex_mul_predicated_x2_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w10, #100 // =0x64 ; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: mov w10, #100 // =0x64 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: whilelo p1.d, xzr, x10 ; CHECK-NEXT: mov x8, xzr @@ -236,19 +236,19 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt ; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: add x9, x9, x11 ; CHECK-NEXT: add x8, x8, x12 -; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 -; CHECK-NEXT: zip2 p3.d, p1.d, p1.d -; CHECK-NEXT: zip1 p2.d, p1.d, p1.d -; CHECK-NEXT: whilelo p1.d, x9, x10 -; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] +; CHECK-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; CHECK-NEXT: zip2 p1.d, p2.d, p2.d +; CHECK-NEXT: zip1 p2.d, p2.d, p2.d +; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl] ; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] ; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p3/m, z7.d +; CHECK-NEXT: mov z0.d, p1/m, z7.d +; CHECK-NEXT: whilelo p1.d, x9, x10 ; CHECK-NEXT: mov z1.d, p2/m, z6.d ; CHECK-NEXT: b.mi .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index 1696ac8709d406..664d99a3627b58 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll @@ -15,11 +15,11 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: neg x9, x9 ; CHECK-NEXT: mov w10, #100 // =0x64 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: and x10, x9, x10 ; CHECK-NEXT: rdvl x11, #2 @@ -101,18 +101,18 @@ exit.block: ; preds = 
%vector.body define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_nonzero_init_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: fmov d0, #1.00000000 ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: fmov d2, #2.00000000 ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: mov w10, #100 // =0x64 -; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: fmov d2, #2.00000000 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: neg x9, x9 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: mov w10, #100 // =0x64 ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d ; CHECK-NEXT: and x10, x9, x10 ; CHECK-NEXT: rdvl x11, #2 -; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d ; CHECK-NEXT: mov z1.d, p0/m, z2.d ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: zip2 z0.d, z1.d, z3.d @@ -190,12 +190,12 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64_unrolled: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cntw x9 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: neg x9, x9 ; CHECK-NEXT: mov w10, #1000 // =0x3e8 +; CHECK-NEXT: neg x9, x9 ; CHECK-NEXT: rdvl x12, #2 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: and x10, x9, x10 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d @@ -324,10 +324,10 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-LABEL: reduction_mix: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z2.d, #0 // =0x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: neg x10, x9 ; CHECK-NEXT: mov w11, #100 // =0x64 +; CHECK-NEXT: neg x10, x9 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: and x10, x10, x11 ; CHECK-NEXT: rdvl x11, #2 @@ -349,8 +349,8 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-NEXT: uaddv d2, p0, z2.d ; CHECK-NEXT: uzp2 z3.d, z1.d, z0.d ; CHECK-NEXT: uzp1 z1.d, z1.d, z0.d -; CHECK-NEXT: fmov x8, d2 ; CHECK-NEXT: faddv d0, p0, z3.d +; CHECK-NEXT: fmov x8, d2 ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z1 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll index 742a7099559f74..17bf5ba6eb48ba 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll @@ -8,8 +8,8 @@ target triple = "aarch64" define @complex_mul_const( %a, %b) { ; CHECK-LABEL: complex_mul_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov z7.d, #3.00000000 ; CHECK-NEXT: fmov z24.d, #11.00000000 ; CHECK-NEXT: mov z6.d, z4.d @@ -55,25 +55,25 @@ entry: define @complex_mul_non_const( %a, %b, [2 x double] %c) { ; CHECK-LABEL: complex_mul_non_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z6.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d5 killed $d5 def $z5 ; CHECK-NEXT: // kill: def $d4 killed $d4 def $z4 ; CHECK-NEXT: mov z5.d, d5 ; CHECK-NEXT: mov z4.d, d4 ; CHECK-NEXT: mov z24.d, z6.d ; CHECK-NEXT: mov z7.d, z6.d -; CHECK-NEXT: zip2 z25.d, z4.d, z5.d -; CHECK-NEXT: zip1 z4.d, z4.d, z5.d ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, 
z1.d, z3.d, #90 +; CHECK-NEXT: zip2 z1.d, z4.d, z5.d ; CHECK-NEXT: fcmla z7.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: zip1 z2.d, z4.d, z5.d ; CHECK-NEXT: mov z0.d, z6.d -; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z25.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0 -; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z25.d, #90 -; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90 +; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z1.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z2.d, #0 +; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z1.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z2.d, #90 ; CHECK-NEXT: mov z1.d, z6.d ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll index 1c254f9ed935de..e6d5a2ac0fd79a 100644 --- a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll +++ b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll @@ -96,8 +96,8 @@ entry: define void @test_concat_fptrunc_v4f64_to_v4f32(ptr %ptr) #1 { ; CHECK-LABEL: test_concat_fptrunc_v4f64_to_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov z0.s, #1.00000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll index 4a2e85c715f7a2..83c7f73800af19 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll @@ -9,8 +9,8 @@ define fastcc i8 @allocno_reload_assign() { ; CHECK-LABEL: allocno_reload_assign: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.b, #0 // =0x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z16.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: uunpklo z1.h, z0.b ; CHECK-NEXT: uunpkhi z0.h, z0.b @@ -48,12 +48,12 @@ define fastcc i8 @allocno_reload_assign() { ; CHECK-NEXT: punpklo p4.h, p3.b ; CHECK-NEXT: punpkhi p3.h, p3.b ; CHECK-NEXT: st1b { z2.d }, p4, [z16.d] -; CHECK-NEXT: punpklo p4.h, p2.b -; CHECK-NEXT: punpkhi p2.h, p2.b ; CHECK-NEXT: st1b { z3.d }, p3, [z16.d] -; CHECK-NEXT: punpklo p3.h, p4.b -; CHECK-NEXT: st1b { z4.d }, p3, [z16.d] -; CHECK-NEXT: punpkhi p3.h, p4.b +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: st1b { z4.d }, p4, [z16.d] ; CHECK-NEXT: st1b { z5.d }, p3, [z16.d] ; CHECK-NEXT: punpklo p3.h, p2.b ; CHECK-NEXT: punpkhi p2.h, p2.b diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll index 49ad3ae7d62907..6e13ae6feb66b4 100644 --- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll @@ -228,13 +228,13 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw ; ALL-NEXT: sdiv x9, x9, x8 ; ALL-NEXT: sdiv x11, x11, x10 ; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: fmov d2, x9 ; ALL-NEXT: fmov d1, x8 ; ALL-NEXT: mul x10, x11, x10 +; ALL-NEXT: mov v2.d[1], x11 +; ALL-NEXT: str q2, [x0] ; ALL-NEXT: mov v1.d[1], x10 ; ALL-NEXT: sub v0.2d, v0.2d, v1.2d -; ALL-NEXT: fmov d1, x9 -; ALL-NEXT: mov v1.d[1], x11 -; ALL-NEXT: str q1, [x0] ; ALL-NEXT: ret %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll 
b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll index 3bc50b2f03d83d..650219e03b8a71 100644 --- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll @@ -228,13 +228,13 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw ; ALL-NEXT: udiv x9, x9, x8 ; ALL-NEXT: udiv x11, x11, x10 ; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: fmov d2, x9 ; ALL-NEXT: fmov d1, x8 ; ALL-NEXT: mul x10, x11, x10 +; ALL-NEXT: mov v2.d[1], x11 +; ALL-NEXT: str q2, [x0] ; ALL-NEXT: mov v1.d[1], x10 ; ALL-NEXT: sub v0.2d, v0.2d, v1.2d -; ALL-NEXT: fmov d1, x9 -; ALL-NEXT: mov v1.d[1], x11 -; ALL-NEXT: str q1, [x0] ; ALL-NEXT: ret %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index dff4831330deb0..bd9d9b99622e34 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -502,9 +502,9 @@ define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b ; CHECK-NEXT: ld1 { v6.s }[1], [x7], #4 ; CHECK-NEXT: ld1 { v7.s }[1], [x7] -; CHECK-NEXT: usubl v5.8h, v6.8b, v7.8b ; CHECK-NEXT: shll v0.4s, v4.4h, #16 ; CHECK-NEXT: shll2 v4.4s, v4.8h, #16 +; CHECK-NEXT: usubl v5.8h, v6.8b, v7.8b ; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: saddw2 v1.4s, v4.4s, v1.8h ; CHECK-NEXT: shll v6.4s, v5.4h, #16 @@ -647,10 +647,10 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: add x8, x3, #8 -; CHECK-NEXT: add x11, x3, #12 +; CHECK-NEXT: add x11, x1, #12 ; CHECK-NEXT: str s1, [x4] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ldp s0, s4, [x2] +; CHECK-NEXT: ldr s0, [x2] ; CHECK-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-NEXT: umov w9, v2.h[0] ; CHECK-NEXT: umov w10, v2.h[1] @@ -664,31 +664,32 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: add x9, x1, #4 ; CHECK-NEXT: mov v1.d[1], v2.d[0] ; CHECK-NEXT: mov v0.b[11], w10 -; CHECK-NEXT: add x10, x1, #12 +; CHECK-NEXT: add x10, x3, #12 ; CHECK-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4 -; CHECK-NEXT: ldr s3, [x0, #12] -; CHECK-NEXT: ldp s2, s7, [x0, #4] -; CHECK-NEXT: ld1 { v4.s }[1], [x3] -; CHECK-NEXT: ldp s5, s6, [x2, #8] -; CHECK-NEXT: ld1 { v3.s }[1], [x10] -; CHECK-NEXT: ld1 { v2.s }[1], [x9] -; CHECK-NEXT: ld1 { v5.s }[1], [x8] -; CHECK-NEXT: ld1 { v6.s }[1], [x11] +; CHECK-NEXT: ldr s5, [x0, #4] +; CHECK-NEXT: ldp s2, s3, [x2, #4] +; CHECK-NEXT: ldr s7, [x2, #12] +; CHECK-NEXT: ldp s6, s4, [x0, #8] +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: ld1 { v7.s }[1], [x10] +; CHECK-NEXT: ld1 { v3.s }[1], [x8] +; CHECK-NEXT: ld1 { v2.s }[1], [x3] ; CHECK-NEXT: add x8, x1, #8 -; CHECK-NEXT: ld1 { v7.s }[1], [x8] -; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b -; CHECK-NEXT: ushll v3.8h, v5.8b, #0 -; CHECK-NEXT: uaddl v4.8h, v4.8b, v6.8b -; CHECK-NEXT: uaddw v1.8h, v1.8h, v7.8b +; CHECK-NEXT: ld1 { v4.s }[1], [x11] +; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: uaddl v2.8h, v2.8b, v7.8b +; CHECK-NEXT: uaddl v4.8h, v5.8b, v4.8b +; CHECK-NEXT: uaddw v1.8h, v1.8h, v6.8b ; CHECK-NEXT: uaddw2 v5.8h, v3.8h, v0.16b -; CHECK-NEXT: ushll v0.4s, v2.4h, #3 +; CHECK-NEXT: ushll v6.4s, v2.4h, #3 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: ushll 
v6.4s, v4.4h, #3 +; CHECK-NEXT: ushll v0.4s, v4.4h, #3 ; CHECK-NEXT: ushll2 v3.4s, v4.8h, #3 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h -; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v5.8h +; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h +; CHECK-NEXT: uaddw2 v3.4s, v2.4s, v5.8h ; CHECK-NEXT: uaddw v2.4s, v6.4s, v5.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p @@ -762,35 +763,35 @@ define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: add x8, x1, #8 ; CHECK-NEXT: ldr s6, [x1, #12] ; CHECK-NEXT: ldp s17, s18, [x2, #8] -; CHECK-NEXT: ldp s2, s3, [x2] +; CHECK-NEXT: ldp s3, s5, [x2] ; CHECK-NEXT: add x9, x3, #8 ; CHECK-NEXT: mov v4.16b, v1.16b ; CHECK-NEXT: ldp s7, s16, [x0] -; CHECK-NEXT: ldr s5, [x3, #12] +; CHECK-NEXT: ldr s2, [x3, #12] ; CHECK-NEXT: mov v1.s[1], v6.s[0] -; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v3.s }[1], [x3], #4 ; CHECK-NEXT: mov v4.s[1], v6.s[0] ; CHECK-NEXT: ld1 { v7.s }[1], [x1], #4 ; CHECK-NEXT: ld1 { v16.s }[1], [x1] -; CHECK-NEXT: ld1 { v3.s }[1], [x3] -; CHECK-NEXT: ld1 { v0.s }[1], [x8] +; CHECK-NEXT: ld1 { v5.s }[1], [x3] ; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ld1 { v0.s }[1], [x8] ; CHECK-NEXT: mov v4.s[2], v18.s[0] -; CHECK-NEXT: mov v18.s[1], v5.s[0] +; CHECK-NEXT: mov v18.s[1], v2.s[0] ; CHECK-NEXT: uaddl v1.8h, v16.8b, v1.8b ; CHECK-NEXT: uaddl v6.8h, v7.8b, v0.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v17.8b -; CHECK-NEXT: uaddl v3.8h, v3.8b, v18.8b +; CHECK-NEXT: uaddl v7.8h, v3.8b, v17.8b ; CHECK-NEXT: ushll v0.4s, v1.4h, #3 ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3 -; CHECK-NEXT: mov v4.s[3], v5.s[0] +; CHECK-NEXT: uaddl v5.8h, v5.8b, v18.8b +; CHECK-NEXT: mov v4.s[3], v2.s[0] ; CHECK-NEXT: uaddw v0.4s, v0.4s, v6.4h ; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v6.8h -; CHECK-NEXT: ushll v7.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 +; CHECK-NEXT: ushll v16.4s, v5.4h, #3 +; CHECK-NEXT: ushll2 v3.4s, v5.8h, #3 ; CHECK-NEXT: str q4, [x4] -; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v2.8h -; CHECK-NEXT: uaddw v2.4s, v7.4s, v2.4h +; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v7.8h +; CHECK-NEXT: uaddw v2.4s, v16.4s, v7.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -873,8 +874,8 @@ define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v4.s }[1], [x11] ; CHECK-NEXT: ld1 { v2.s }[1], [x3] ; CHECK-NEXT: ld1 { v0.s }[1], [x10] -; CHECK-NEXT: ld1 { v7.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ld1 { v7.s }[1], [x9] ; CHECK-NEXT: uaddl v5.8h, v5.8b, v4.8b ; CHECK-NEXT: uaddl v2.8h, v2.8b, v0.8b ; CHECK-NEXT: ushll v16.8h, v0.8b, #0 @@ -972,8 +973,8 @@ define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v5.s }[1], [x11] ; CHECK-NEXT: ld1 { v3.s }[1], [x3] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: uaddl v5.8h, v1.8b, v5.8b ; CHECK-NEXT: uaddl v7.8h, v3.8b, v7.8b ; CHECK-NEXT: uaddl v1.8h, v0.8b, v4.8b @@ -1072,23 +1073,23 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v6.s }[1], [x9] ; CHECK-NEXT: ld1 { v4.s }[1], [x8] ; CHECK-NEXT: uaddl v7.8h, v3.8b, v7.8b -; CHECK-NEXT: uaddl v3.8h, v1.8b, v5.8b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b ; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b ; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b ; CHECK-NEXT: ushll v0.4s, v7.4h, #3 -; CHECK-NEXT: ushll2 v1.4s, 
v7.8h, #3 -; CHECK-NEXT: ushll v5.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v6.4s, v3.8h, #3 -; CHECK-NEXT: ushll2 v16.4s, v3.8h, #0 -; CHECK-NEXT: ushll v17.4s, v3.4h, #0 -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h +; CHECK-NEXT: ushll2 v3.4s, v7.8h, #3 +; CHECK-NEXT: ushll v6.4s, v1.4h, #3 +; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3 +; CHECK-NEXT: ushll2 v5.4s, v1.8h, #0 +; CHECK-NEXT: ushll v17.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v18.4s, v7.8h, #0 +; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v2.8h ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: uaddw v2.4s, v5.4s, v4.4h -; CHECK-NEXT: uaddw2 v3.4s, v6.4s, v4.8h -; CHECK-NEXT: ushll2 v4.4s, v7.8h, #0 -; CHECK-NEXT: ushll v5.4s, v7.4h, #0 -; CHECK-NEXT: stp q17, q16, [x4, #32] -; CHECK-NEXT: stp q5, q4, [x4] +; CHECK-NEXT: uaddw v2.4s, v6.4s, v4.4h +; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v4.8h +; CHECK-NEXT: ushll v4.4s, v7.4h, #0 +; CHECK-NEXT: stp q17, q5, [x4, #32] +; CHECK-NEXT: stp q4, q18, [x4] ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -1157,32 +1158,32 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_shl: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ldp s1, s2, [x0] ; CHECK-NEXT: add x10, x3, #12 -; CHECK-NEXT: ldp s2, s3, [x2] +; CHECK-NEXT: ldp s0, s3, [x2] ; CHECK-NEXT: add x11, x1, #12 ; CHECK-NEXT: ldp s4, s5, [x0, #8] ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: ldp s6, s7, [x2, #8] ; CHECK-NEXT: add x9, x1, #8 -; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v2.s }[1], [x1] ; CHECK-NEXT: ld1 { v5.s }[1], [x11] ; CHECK-NEXT: ld1 { v3.s }[1], [x3] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] ; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] -; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b +; CHECK-NEXT: uaddl v2.8h, v2.8b, v5.8b ; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b -; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b -; CHECK-NEXT: ushll v5.4s, v1.4h, #3 +; CHECK-NEXT: uaddl v4.8h, v1.8b, v4.8b +; CHECK-NEXT: ushll v5.4s, v2.4h, #3 +; CHECK-NEXT: ushll2 v7.4s, v2.8h, #3 +; CHECK-NEXT: uaddl v2.8h, v0.8b, v6.8b ; CHECK-NEXT: ushll v6.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v7.4s, v1.8h, #3 ; CHECK-NEXT: ushll2 v16.4s, v3.8h, #3 -; CHECK-NEXT: uaddw v0.4s, v5.4s, v4.4h ; CHECK-NEXT: uaddw2 v1.4s, v7.4s, v4.8h +; CHECK-NEXT: uaddw v0.4s, v5.4s, v4.4h ; CHECK-NEXT: stp q5, q7, [x4] ; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v2.8h ; CHECK-NEXT: uaddw v2.4s, v6.4s, v2.4h diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index 9916aeeab1cad1..b1ca88975a6218 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -280,10 +280,10 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x ; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s ; CHECK-GI-NEXT: fmov s2, w8 ; CHECK-GI-NEXT: mov v2.s[1], w8 -; CHECK-GI-NEXT: neg v3.4s, v1.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: neg v1.4s, v1.4s ; CHECK-GI-NEXT: mov v2.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v2.16b ; CHECK-GI-NEXT: and v0.16b, v6.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v7.16b, v1.16b @@ -348,10 +348,10 @@ define <3 
x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d, ; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: neg v5.4s, v4.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: neg v4.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v5.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b @@ -426,10 +426,10 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i ; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: neg v5.4s, v4.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: neg v4.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v5.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b @@ -545,8 +545,8 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-NOFP16-LABEL: v7f16_half: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: mov w8, #15 // =0xf -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h7, v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4] +; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5] ; CHECK-GI-NOFP16-NEXT: fmov s4, w8 ; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[4] ; CHECK-GI-NOFP16-NEXT: mov w8, #65535 // =0xffff @@ -555,32 +555,32 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-NOFP16-NEXT: mov h19, v1.h[6] ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov v5.16b, v4.16b -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fmov s7, w8 +; CHECK-GI-NOFP16-NEXT: mov v7.16b, v4.16b +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0] +; CHECK-GI-NOFP16-NEXT: fmov s6, w8 ; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v17.h[0] -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.16b, v7.16b +; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.16b, v6.16b ; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v18.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[1], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v18.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[1], v6.h[0] ; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v19.h[0] -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: mov v17.h[2], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: mov v17.h[2], v6.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v16.4s, v16.4h -; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[3], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v16.4s, v6.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[4], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[4], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v7.h[3], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[3], v6.h[0] +; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v16.4s, v5.4s +; CHECK-GI-NOFP16-NEXT: mov v7.h[4], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[4], v6.h[0] ; CHECK-GI-NOFP16-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-GI-NOFP16-NEXT: mov v5.h[5], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[5], 
v7.h[0] -; CHECK-GI-NOFP16-NEXT: mov v5.h[6], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[6], v7.h[0] -; CHECK-GI-NOFP16-NEXT: neg v1.8h, v5.8h -; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v5.8h +; CHECK-GI-NOFP16-NEXT: mov v7.h[5], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[5], v6.h[0] +; CHECK-GI-NOFP16-NEXT: mov v7.h[6], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[6], v6.h[0] +; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v7.8h +; CHECK-GI-NOFP16-NEXT: neg v1.8h, v7.8h ; CHECK-GI-NOFP16-NEXT: sshl v0.8h, v0.8h, v1.8h ; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v17.16b ; CHECK-GI-NOFP16-NEXT: and v0.16b, v2.16b, v0.16b @@ -609,8 +609,8 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-FP16-NEXT: mov v7.h[5], v6.h[0] ; CHECK-GI-FP16-NEXT: mov v5.h[6], v4.h[0] ; CHECK-GI-FP16-NEXT: mov v7.h[6], v6.h[0] -; CHECK-GI-FP16-NEXT: neg v1.8h, v5.8h ; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v5.8h +; CHECK-GI-FP16-NEXT: neg v1.8h, v5.8h ; CHECK-GI-FP16-NEXT: sshl v0.8h, v0.8h, v1.8h ; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v7.16b ; CHECK-GI-FP16-NEXT: and v0.16b, v2.16b, v0.16b @@ -1047,6 +1047,7 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[6] ; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[6] ; CHECK-GI-NOFP16-NEXT: fmov s16, w0 +; CHECK-GI-NOFP16-NEXT: fmov s18, w4 ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] @@ -1054,6 +1055,7 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] ; CHECK-GI-NOFP16-NEXT: ldr s5, [sp] ; CHECK-GI-NOFP16-NEXT: mov v16.s[1], w1 +; CHECK-GI-NOFP16-NEXT: mov v18.s[1], w5 ; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w8 ; CHECK-GI-NOFP16-NEXT: fmov w9, s5 ; CHECK-GI-NOFP16-NEXT: fmov s5, w7 @@ -1069,27 +1071,25 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w8 ; CHECK-GI-NOFP16-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v17.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #40] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v18.s[2], w6 ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: mov v16.s[3], w3 ; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w9 -; CHECK-GI-NOFP16-NEXT: neg v18.4s, v3.4s ; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v17.s[0] ; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v4.4s, v2.4s ; CHECK-GI-NOFP16-NEXT: fmov s4, w8 ; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8 ; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: fmov s3, w4 -; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w5 +; CHECK-GI-NOFP16-NEXT: neg v3.4s, v3.4s ; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8 -; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v18.4s +; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v3.4s ; CHECK-GI-NOFP16-NEXT: fmov w8, s6 -; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w6 -; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v4.16b ; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w8 +; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v4.16b +; CHECK-GI-NOFP16-NEXT: and v2.16b, v18.16b, v2.16b ; CHECK-GI-NOFP16-NEXT: and v1.16b, v7.16b, v1.16b -; CHECK-GI-NOFP16-NEXT: and v2.16b, v3.16b, v2.16b ; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v16.16b, v5.16b ; CHECK-GI-NOFP16-NEXT: orr v1.16b, v2.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1] @@ -1111,30 +1111,32 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> 
%b, <7 x i32> %d, <7 x i32 ; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h ; CHECK-GI-FP16-NEXT: mov w10, #31 // =0x1f ; CHECK-GI-FP16-NEXT: ldr s3, [sp] -; CHECK-GI-FP16-NEXT: fmov s1, w10 +; CHECK-GI-FP16-NEXT: fmov s2, w10 ; CHECK-GI-FP16-NEXT: fmov s6, w0 ; CHECK-GI-FP16-NEXT: ldr s4, [sp, #8] +; CHECK-GI-FP16-NEXT: fmov s17, w4 ; CHECK-GI-FP16-NEXT: ldr s7, [sp, #24] ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #32] ; CHECK-GI-FP16-NEXT: umov w8, v0.h[4] ; CHECK-GI-FP16-NEXT: umov w9, v0.h[5] -; CHECK-GI-FP16-NEXT: mov v1.s[1], w10 +; CHECK-GI-FP16-NEXT: mov v2.s[1], w10 ; CHECK-GI-FP16-NEXT: mov v6.s[1], w1 +; CHECK-GI-FP16-NEXT: mov v17.s[1], w5 ; CHECK-GI-FP16-NEXT: mov v7.s[1], v16.s[0] ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #40] -; CHECK-GI-FP16-NEXT: fmov s2, w8 +; CHECK-GI-FP16-NEXT: fmov s1, w8 ; CHECK-GI-FP16-NEXT: umov w8, v0.h[6] -; CHECK-GI-FP16-NEXT: mov v1.s[2], w10 +; CHECK-GI-FP16-NEXT: mov v2.s[2], w10 ; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-FP16-NEXT: mov v6.s[2], w2 +; CHECK-GI-FP16-NEXT: mov v17.s[2], w6 ; CHECK-GI-FP16-NEXT: mov v7.s[2], v16.s[0] -; CHECK-GI-FP16-NEXT: mov v2.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v1.s[1], w9 ; CHECK-GI-FP16-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-FP16-NEXT: fmov s5, w9 -; CHECK-GI-FP16-NEXT: neg v17.4s, v1.4s ; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-GI-FP16-NEXT: mov v6.s[3], w3 -; CHECK-GI-FP16-NEXT: mov v2.s[2], w8 +; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 ; CHECK-GI-FP16-NEXT: fmov w8, s3 ; CHECK-GI-FP16-NEXT: fmov s3, w7 ; CHECK-GI-FP16-NEXT: mov v5.s[1], w9 @@ -1142,26 +1144,24 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-FP16-NEXT: mov v3.s[1], w8 ; CHECK-GI-FP16-NEXT: fmov w8, s4 ; CHECK-GI-FP16-NEXT: ldr s4, [sp, #16] -; CHECK-GI-FP16-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-GI-FP16-NEXT: fmov s2, w4 +; CHECK-GI-FP16-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-GI-FP16-NEXT: neg v2.4s, v2.4s ; CHECK-GI-FP16-NEXT: mov v5.s[2], w9 -; CHECK-GI-FP16-NEXT: mov v2.s[1], w5 ; CHECK-GI-FP16-NEXT: mov v3.s[2], w8 -; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v17.4s +; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v2.4s ; CHECK-GI-FP16-NEXT: fmov w8, s4 -; CHECK-GI-FP16-NEXT: eor v4.16b, v1.16b, v5.16b -; CHECK-GI-FP16-NEXT: mov v2.s[2], w6 +; CHECK-GI-FP16-NEXT: eor v2.16b, v1.16b, v5.16b +; CHECK-GI-FP16-NEXT: and v1.16b, v17.16b, v1.16b ; CHECK-GI-FP16-NEXT: mov v3.s[3], w8 -; CHECK-GI-FP16-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v4.16b +; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v2.16b ; CHECK-GI-FP16-NEXT: bsl v0.16b, v6.16b, v3.16b ; CHECK-GI-FP16-NEXT: orr v1.16b, v1.16b, v2.16b ; CHECK-GI-FP16-NEXT: mov s2, v0.s[1] ; CHECK-GI-FP16-NEXT: mov s3, v0.s[2] ; CHECK-GI-FP16-NEXT: mov s4, v0.s[3] -; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: mov s5, v1.s[1] ; CHECK-GI-FP16-NEXT: mov s6, v1.s[2] +; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: fmov w4, s1 ; CHECK-GI-FP16-NEXT: fmov w1, s2 ; CHECK-GI-FP16-NEXT: fmov w2, s3 diff --git a/llvm/test/CodeGen/AArch64/fdiv-combine.ll b/llvm/test/CodeGen/AArch64/fdiv-combine.ll index 1ed63f3ef25077..0627250d077912 100644 --- a/llvm/test/CodeGen/AArch64/fdiv-combine.ll +++ b/llvm/test/CodeGen/AArch64/fdiv-combine.ll @@ -171,8 +171,8 @@ entry: define @splat_fdiv_nxv2f64(double %D, %a) #1 { ; CHECK-LABEL: splat_fdiv_nxv2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: 
fdivr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll index 03e64f8b785b04..a78addc490086d 100644 --- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll @@ -604,8 +604,8 @@ define fastcc i1 @quantum_hadamard(i32 %0) { define @fdiv_pow2_nx4xfloat( %i) "target-features"="+sve" { ; CHECK-LABEL: fdiv_pow2_nx4xfloat: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: lslr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fmov z1.s, #9.00000000 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s diff --git a/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll b/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll index 67c056c780cc80..2c8e2190f8209f 100644 --- a/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll +++ b/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll @@ -62,9 +62,9 @@ define @frem_nxv4f32( %unused, @frem_nxv4f32( %unused, @frem_strict_nxv2f64( %unused, ; ARMPL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; ARMPL-NEXT: .cfi_def_cfa_offset 16 ; ARMPL-NEXT: .cfi_offset w30, -16 -; ARMPL-NEXT: ptrue p0.d ; ARMPL-NEXT: mov z0.d, z1.d ; ARMPL-NEXT: mov z1.d, z2.d +; ARMPL-NEXT: ptrue p0.d ; ARMPL-NEXT: bl armpl_svfmod_f64_x ; ARMPL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; ARMPL-NEXT: ret @@ -102,9 +102,9 @@ define @frem_strict_nxv2f64( %unused, ; SLEEF-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; SLEEF-NEXT: .cfi_def_cfa_offset 16 ; SLEEF-NEXT: .cfi_offset w30, -16 -; SLEEF-NEXT: ptrue p0.d ; SLEEF-NEXT: mov z0.d, z1.d ; SLEEF-NEXT: mov z1.d, z2.d +; SLEEF-NEXT: ptrue p0.d ; SLEEF-NEXT: bl _ZGVsMxvv_fmod ; SLEEF-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; SLEEF-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll index 301d28fd7be56b..2ea581359af6fd 100644 --- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll @@ -194,10 +194,10 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fcvtzs v0.2d, v0.2d ; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: smax v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -833,10 +833,10 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fcvtzs v0.2d, v0.2d ; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: smax v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 92fd3183393ea7..c45885a38f1592 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -697,9 +697,9 @@ define <2 x i1> @test_signed_v2f32_v2i1(<2 x float> %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: fcvtzs v0.2s, v0.2s +; CHECK-NEXT: 
movi v2.2d, #0xffffffffffffffff ; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: smax v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret %x = call <2 x i1> @llvm.fptosi.sat.v2f32.v2i1(<2 x float> %f) ret <2 x i1> %x @@ -1620,9 +1620,9 @@ define <4 x i1> @test_signed_v4f16_v4i1(<4 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: movi v1.2d, #0000000000000000 ; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h +; CHECK-FP16-NEXT: movi v2.2d, #0xffffffffffffffff ; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v1.4h -; CHECK-FP16-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v1.4h +; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v2.4h ; CHECK-FP16-NEXT: ret %x = call <4 x i1> @llvm.fptosi.sat.v4f16.v4i1(<4 x half> %f) ret <4 x i1> %x @@ -1668,9 +1668,9 @@ define <4 x i13> @test_signed_v4f16_v4i13(<4 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h ; CHECK-FP16-NEXT: mvni v1.4h, #240, lsl #8 +; CHECK-FP16-NEXT: movi v2.4h, #240, lsl #8 ; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v1.4h -; CHECK-FP16-NEXT: movi v1.4h, #240, lsl #8 -; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v1.4h +; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v2.4h ; CHECK-FP16-NEXT: ret %x = call <4 x i13> @llvm.fptosi.sat.v4f16.v4i13(<4 x half> %f) ret <4 x i13> %x @@ -2103,9 +2103,9 @@ define <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: movi v1.2d, #0000000000000000 ; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h +; CHECK-FP16-NEXT: movi v2.2d, #0xffffffffffffffff ; CHECK-FP16-NEXT: smin v0.8h, v0.8h, v1.8h -; CHECK-FP16-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v1.8h +; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v2.8h ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret %x = call <8 x i1> @llvm.fptosi.sat.v8f16.v8i1(<8 x half> %f) @@ -2254,9 +2254,9 @@ define <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h ; CHECK-FP16-NEXT: mvni v1.8h, #240, lsl #8 +; CHECK-FP16-NEXT: movi v2.8h, #240, lsl #8 ; CHECK-FP16-NEXT: smin v0.8h, v0.8h, v1.8h -; CHECK-FP16-NEXT: movi v1.8h, #240, lsl #8 -; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v1.8h +; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v2.8h ; CHECK-FP16-NEXT: ret %x = call <8 x i13> @llvm.fptosi.sat.v8f16.v8i13(<8 x half> %f) ret <8 x i13> %x diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll index 181f2185893e43..d39c09524e1ad6 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll @@ -78,9 +78,9 @@ define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %z) { ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.4s, #31 ; CHECK-NEXT: neg v3.4s, v1.4s +; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: neg v2.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll index 97511639ec8cf8..cb9f04a7fac48a 100644 --- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -202,8 +202,8 @@ define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> 
%x, <4 x i32> %y) nounwind { define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: vec_4xi32_nonsplat_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s ; CHECK-NEXT: and v0.16b, v1.16b, v0.16b diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index e7352fe03d01a8..8e10847e7aae34 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -179,10 +179,10 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32> ; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: neg v5.4s, v4.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: neg v4.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v5.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll index e4d2b516b8fbfe..0b730f6e771560 100644 --- a/llvm/test/CodeGen/AArch64/insert-extend.ll +++ b/llvm/test/CodeGen/AArch64/insert-extend.ll @@ -64,104 +64,104 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca ; CHECK-NEXT: ldr d3, [x11] ; CHECK-NEXT: ldr d4, [x10, x8] ; CHECK-NEXT: ldr d5, [x11, x9] +; CHECK-NEXT: shll2 v6.4s, v0.8h, #16 ; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b ; CHECK-NEXT: usubl v3.8h, v4.8b, v5.8b -; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 -; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: shll2 v4.4s, v1.8h, #16 +; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h ; CHECK-NEXT: shll2 v6.4s, v2.8h, #16 -; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: shll2 v4.4s, v3.8h, #16 -; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h -; CHECK-NEXT: saddw v3.4s, v4.4s, v3.4h +; CHECK-NEXT: shll2 v5.4s, v3.8h, #16 +; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h ; CHECK-NEXT: rev64 v4.4s, v0.4s +; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h +; CHECK-NEXT: saddw v3.4s, v5.4s, v3.4h ; CHECK-NEXT: rev64 v5.4s, v1.4s ; CHECK-NEXT: rev64 v6.4s, v2.4s -; CHECK-NEXT: rev64 v7.4s, v3.4s ; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s ; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s +; CHECK-NEXT: rev64 v7.4s, v3.4s ; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s ; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s ; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s ; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s -; CHECK-NEXT: ext v1.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s ; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s ; CHECK-NEXT: mov v6.s[1], v7.s[0] ; CHECK-NEXT: ext v7.16b, v5.16b, v16.16b, #8 ; CHECK-NEXT: mov v5.s[3], v4.s[2] -; CHECK-NEXT: uzp1 v4.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v0.4s +; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8 ; CHECK-NEXT: mov v6.d[1], v7.d[1] ; CHECK-NEXT: mov v3.d[1], v5.d[1] +; CHECK-NEXT: uzp1 v1.4s, v4.4s, v0.4s +; CHECK-NEXT: uzp2 v4.4s, v4.4s, v0.4s ; CHECK-NEXT: uzp2 v5.4s, v2.4s, v0.4s ; CHECK-NEXT: uzp1 v0.4s, v2.4s, v0.4s -; CHECK-NEXT: sub v1.4s, v4.4s, v1.4s ; CHECK-NEXT: add v2.4s, v3.4s, v6.4s ; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s ; CHECK-NEXT: 
rev64 v4.4s, v2.4s ; CHECK-NEXT: rev64 v5.4s, v3.4s -; CHECK-NEXT: addp v16.4s, v1.4s, v3.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s ; CHECK-NEXT: rev64 v7.4s, v0.4s +; CHECK-NEXT: addp v16.4s, v1.4s, v3.4s ; CHECK-NEXT: addp v17.4s, v0.4s, v2.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s ; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s ; CHECK-NEXT: zip1 v18.4s, v17.4s, v17.4s -; CHECK-NEXT: ext v6.16b, v1.16b, v16.16b, #8 ; CHECK-NEXT: ext v4.16b, v17.16b, v2.16b, #4 ; CHECK-NEXT: ext v5.16b, v16.16b, v3.16b, #4 ; CHECK-NEXT: mov v20.16b, v3.16b +; CHECK-NEXT: ext v6.16b, v1.16b, v16.16b, #8 ; CHECK-NEXT: ext v7.16b, v0.16b, v17.16b, #4 ; CHECK-NEXT: mov v21.16b, v2.16b ; CHECK-NEXT: trn2 v0.4s, v18.4s, v0.4s -; CHECK-NEXT: ext v19.16b, v6.16b, v1.16b, #4 -; CHECK-NEXT: mov v1.s[2], v16.s[1] ; CHECK-NEXT: mov v20.s[2], v16.s[3] ; CHECK-NEXT: zip2 v4.4s, v4.4s, v17.4s ; CHECK-NEXT: zip2 v5.4s, v5.4s, v16.4s ; CHECK-NEXT: mov v21.s[2], v17.s[3] +; CHECK-NEXT: ext v19.16b, v6.16b, v1.16b, #4 ; CHECK-NEXT: ext v7.16b, v7.16b, v7.16b, #4 -; CHECK-NEXT: mov v18.16b, v1.16b +; CHECK-NEXT: mov v1.s[2], v16.s[1] ; CHECK-NEXT: ext v2.16b, v2.16b, v4.16b, #12 ; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #12 ; CHECK-NEXT: uzp2 v4.4s, v6.4s, v19.4s ; CHECK-NEXT: mov v5.16b, v7.16b ; CHECK-NEXT: mov v6.16b, v20.16b +; CHECK-NEXT: mov v18.16b, v1.16b ; CHECK-NEXT: mov v19.16b, v21.16b -; CHECK-NEXT: mov v18.s[1], v16.s[0] ; CHECK-NEXT: sub v7.4s, v0.4s, v7.4s ; CHECK-NEXT: mov v6.s[1], v16.s[2] ; CHECK-NEXT: mov v5.s[0], v17.s[1] +; CHECK-NEXT: mov v18.s[1], v16.s[0] ; CHECK-NEXT: mov v19.s[1], v17.s[2] ; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s ; CHECK-NEXT: sub v16.4s, v20.4s, v3.4s ; CHECK-NEXT: sub v17.4s, v21.4s, v2.4s -; CHECK-NEXT: add v4.4s, v18.4s, v4.4s ; CHECK-NEXT: add v3.4s, v6.4s, v3.4s ; CHECK-NEXT: add v0.4s, v0.4s, v5.4s +; CHECK-NEXT: add v4.4s, v18.4s, v4.4s ; CHECK-NEXT: add v2.4s, v19.4s, v2.4s -; CHECK-NEXT: mov v4.d[1], v1.d[1] ; CHECK-NEXT: mov v3.d[1], v16.d[1] ; CHECK-NEXT: mov v0.d[1], v7.d[1] +; CHECK-NEXT: mov v4.d[1], v1.d[1] ; CHECK-NEXT: mov v2.d[1], v17.d[1] -; CHECK-NEXT: cmlt v6.8h, v4.8h, #0 ; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v4.8h, #0 ; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 -; CHECK-NEXT: add v4.4s, v6.4s, v4.4s ; CHECK-NEXT: add v3.4s, v1.4s, v3.4s ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-NEXT: add v4.4s, v6.4s, v4.4s ; CHECK-NEXT: add v2.4s, v7.4s, v2.4s ; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b ; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b ; CHECK-NEXT: eor v3.16b, v4.16b, v6.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll index 29f9c0336bbcce..542b2e90ffc159 100644 --- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll +++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll @@ -9,9 +9,9 @@ define @vec_scalable_subvec_scalable_idx_zero_i8(ptr %a, ptr % ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1b { z0.h }, p0/z, 
[x0] -; CHECK-NEXT: ld1b { z1.s }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1b { z1.s }, p0/z, [x1] ; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h ; CHECK-NEXT: ret @@ -25,9 +25,9 @@ define @vec_scalable_subvec_scalable_idx_nonzero_i8(ptr %a, pt ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.s }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1b { z1.s }, p0/z, [x1] ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h ; CHECK-NEXT: ret @@ -41,9 +41,9 @@ define @vec_scalable_subvec_scalable_idx_zero_i16(ptr %a, ptr ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.d }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x1] ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: ret @@ -57,9 +57,9 @@ define @vec_scalable_subvec_scalable_idx_nonzero_i16(ptr %a, ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.d }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x1] ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: ret @@ -76,10 +76,10 @@ define @vec_scalable_subvec_fixed_idx_zero_i8(ptr %a, ptr %b) ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ptrue p1.h, vl8 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: ret %vec = load , ptr %a %subvec = load <8 x i8>, ptr %b @@ -92,19 +92,19 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i8(ptr %a, ptr % ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cnth x8 -; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: sub x8, x8, #8 ; CHECK-NEXT: mov w9, #8 // =0x8 ; CHECK-NEXT: cmp x8, #8 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #1 -; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] -; CHECK-NEXT: st1h { z1.h }, p0, [sp] -; CHECK-NEXT: str q0, [x9, x8] +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -120,10 +120,10 @@ define @vec_scalable_subvec_fixed_idx_zero_i16(ptr %a, ptr %b ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ptrue p1.s, vl4 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %vec = load , ptr %a %subvec = load <4 x i16>, ptr %b @@ -136,19 +136,19 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i16(ptr %a, ptr ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cntw x8 -; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: sub x8, x8, #4 ; CHECK-NEXT: mov w9, #4 // =0x4 ; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #2 -; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] -; CHECK-NEXT: st1w { z1.s }, p0, [sp] -; CHECK-NEXT: str q0, [x9, x8] +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -164,10 +164,10 @@ define @vec_scalable_subvec_fixed_idx_zero_i32(ptr %a, ptr %b ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ptrue p1.d, vl2 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %vec = load , ptr %a %subvec = load <2 x i32>, ptr %b @@ -180,19 +180,19 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i32(ptr %a, ptr ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x8 -; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: mov w9, #2 // =0x2 ; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] -; CHECK-NEXT: st1d { z1.d }, p0, [sp] -; CHECK-NEXT: str q0, [x9, x8] +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll index 1a4ab6ab334a64..9bd2ed240810dc 100644 --- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll +++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s @@ -31,28 +32,28 @@ define i32 @ctz_nxv32i1( %a) #0 { ; CHECK-NEXT: neg x8, x8 ; CHECK-NEXT: punpklo p3.h, p1.b ; CHECK-NEXT: rdvl x9, #2 -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: rdvl x8, #-1 -; CHECK-NEXT: punpkhi p1.h, p1.b +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: inch z0.h, all, mul #4 +; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p2.h +; CHECK-NEXT: inch z0.h, all, mul #4 ; CHECK-NEXT: mov z5.h, p3/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: add z1.h, z0.h, z1.h -; CHECK-NEXT: add z4.h, z0.h, z2.h ; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z7.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add z1.h, z0.h, z1.h +; CHECK-NEXT: add z4.h, z0.h, z2.h ; CHECK-NEXT: and z0.d, z0.d, z3.d ; CHECK-NEXT: add z2.h, z1.h, z2.h ; CHECK-NEXT: and z3.d, z4.d, z5.d ; CHECK-NEXT: and z1.d, z1.d, z6.d ; CHECK-NEXT: and z2.d, z2.d, z7.d -; CHECK-NEXT: umax z0.h, p2/m, z0.h, z3.h -; CHECK-NEXT: umax z1.h, p2/m, z1.h, z2.h -; CHECK-NEXT: umax z0.h, p2/m, z0.h, z1.h -; CHECK-NEXT: umaxv h0, p2, z0.h +; CHECK-NEXT: umax z0.h, p0/m, z0.h, z3.h +; CHECK-NEXT: umax z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sub w8, w9, w8 ; CHECK-NEXT: and w0, w8, #0xffff @@ -65,12 +66,12 @@ define i32 @ctz_nxv4i32( %a) #0 { ; CHECK-LABEL: ctz_nxv4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: index z1.s, #0, #-1 ; CHECK-NEXT: cntw x9 -; CHECK-NEXT: incw z1.s ; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: index z0.s, #0, #-1 +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: incw z0.s +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w8, s0 @@ -87,38 +88,38 @@ define i64 @vscale_4096( %a) #1 { ; CHECK-LABEL: vscale_4096: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: index z1.s, #0, #-1 ; CHECK-NEXT: cntw x8 -; 
CHECK-NEXT: cnth x9 ; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: neg x8, x9 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: index z0.s, #0, #-1 +; CHECK-NEXT: mov z0.s, w8 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: neg x8, x8 +; CHECK-NEXT: incw z1.s, all, mul #4 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: incw z0.s, all, mul #4 -; CHECK-NEXT: add z1.s, z0.s, z1.s -; CHECK-NEXT: add z5.s, z0.s, z2.s +; CHECK-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEXT: add z4.s, z1.s, z2.s ; CHECK-NEXT: punpkhi p2.h, p1.b ; CHECK-NEXT: punpkhi p3.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: add z2.s, z1.s, z2.s -; CHECK-NEXT: punpklo p1.h, p1.b ; CHECK-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p2.s -; CHECK-NEXT: mov z4.s, p3/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: add z2.s, z0.s, z2.s +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: mov z5.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z7.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: and z2.d, z2.d, z4.d -; CHECK-NEXT: and z3.d, z5.d, z6.d -; CHECK-NEXT: and z0.d, z0.d, z7.d -; CHECK-NEXT: umax z1.s, p2/m, z1.s, z2.s -; CHECK-NEXT: umax z0.s, p2/m, z0.s, z3.s -; CHECK-NEXT: umax z0.s, p2/m, z0.s, z1.s -; CHECK-NEXT: umaxv s0, p2, z0.s +; CHECK-NEXT: and z0.d, z0.d, z3.d +; CHECK-NEXT: and z2.d, z2.d, z5.d +; CHECK-NEXT: and z3.d, z4.d, z6.d +; CHECK-NEXT: and z1.d, z1.d, z7.d +; CHECK-NEXT: umax z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: umax z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret @@ -130,21 +131,21 @@ define i64 @vscale_4096_poison( %a) #1 { ; CHECK-LABEL: vscale_4096_poison: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: index z1.h, #0, #-1 ; CHECK-NEXT: cnth x8 -; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: index z0.h, #0, #-1 +; CHECK-NEXT: mov z0.h, w8 +; CHECK-NEXT: inch z1.h, all, mul #2 ; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: inch z0.h, all, mul #2 -; CHECK-NEXT: add z1.h, z0.h, z1.h +; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: and z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z3.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z3.d ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w8, s0 @@ -161,16 +162,16 @@ define i32 @ctz_nxv8i1_no_range( %a) { ; CHECK-LABEL: ctz_nxv8i1_no_range: ; CHECK: // %bb.0: ; CHECK-NEXT: index z0.s, #0, #-1 -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: cntw x8 -; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: neg x8, x8 +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: cnth x9 ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: incw z0.s, all, mul #2 ; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: incw z0.s, all, mul #2 ; CHECK-NEXT: add z1.s, z0.s, z1.s ; CHECK-NEXT: and z0.d, z0.d, 
z2.d ; CHECK-NEXT: and z1.d, z1.d, z3.d @@ -212,8 +213,8 @@ define i32 @ctz_nxv16i1_poison( %pg, %a) { define i32 @ctz_and_nxv16i1( %pg, %a, %b) { ; CHECK-LABEL: ctz_and_nxv16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: brkb p0.b, p1/z, p0.b ; CHECK-NEXT: cntp x0, p0, p0.b ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index f5a7b5dc9f4922..ae4ced258bb8e2 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -6555,18 +6555,18 @@ define <3 x bfloat> @stofp_v3i64_v3bf16(<3 x i64> %a) { ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: scvtf v1.2d, v2.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6582,18 +6582,18 @@ define <3 x bfloat> @utofp_v3i64_v3bf16(<3 x i64> %a) { ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ucvtf v1.2d, v2.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6606,17 +6606,17 @@ define <4 x bfloat> @stofp_v4i64_v4bf16(<4 x i64> %a) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6629,17 +6629,17 @@ define <4 x 
bfloat> @utofp_v4i64_v4bf16(<4 x i64> %a) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6658,22 +6658,22 @@ define <8 x bfloat> @stofp_v8i64_v8bf16(<8 x i64> %a) { ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v4.4s, v2.4s, #16 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-NEXT: add v6.4s, v2.4s, v1.4s -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v4.16b, v4.16b, v3.16b -; CHECK-NEXT: and v3.16b, v5.16b, v3.16b +; CHECK-NEXT: add v6.4s, v2.4s, v3.4s +; CHECK-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-NEXT: and v1.16b, v5.16b, v1.16b ; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s ; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b -; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret entry: @@ -6692,22 +6692,22 @@ define <8 x bfloat> @utofp_v8i64_v8bf16(<8 x i64> %a) { ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v4.4s, v2.4s, #16 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-NEXT: add v6.4s, v2.4s, v1.4s -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v4.16b, v4.16b, v3.16b -; CHECK-NEXT: and v3.16b, v5.16b, v3.16b +; CHECK-NEXT: add v6.4s, v2.4s, v3.4s +; CHECK-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-NEXT: and v1.16b, v5.16b, v1.16b ; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s ; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b -; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret entry: @@ -6718,51 +6718,51 @@ entry: define <16 x bfloat> @stofp_v16i64_v16bf16(<16 x i64> %a) { ; CHECK-LABEL: stofp_v16i64_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v2.2d, v2.2d ; CHECK-NEXT: scvtf v0.2d, 
v0.2d +; CHECK-NEXT: scvtf v2.2d, v2.2d ; CHECK-NEXT: scvtf v6.2d, v6.2d ; CHECK-NEXT: scvtf v4.2d, v4.2d -; CHECK-NEXT: scvtf v3.2d, v3.2d ; CHECK-NEXT: scvtf v1.2d, v1.2d +; CHECK-NEXT: scvtf v3.2d, v3.2d ; CHECK-NEXT: scvtf v7.2d, v7.2d ; CHECK-NEXT: scvtf v5.2d, v5.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v6.2s, v6.2d ; CHECK-NEXT: fcvtn v4.2s, v4.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v6.4s, v7.2d ; CHECK-NEXT: fcvtn2 v4.4s, v5.2d -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: ushr v5.4s, v2.4s, #16 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v7.4s, v0.4s, #16 -; CHECK-NEXT: add v17.4s, v2.4s, v1.4s -; CHECK-NEXT: add v19.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v5.4s, v2.4s, #16 ; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v18.4s, v4.4s, #16 -; CHECK-NEXT: add v20.4s, v6.4s, v1.4s -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: and v5.16b, v5.16b, v3.16b -; CHECK-NEXT: and v7.16b, v7.16b, v3.16b -; CHECK-NEXT: and v16.16b, v16.16b, v3.16b -; CHECK-NEXT: and v3.16b, v18.16b, v3.16b -; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: add v5.4s, v5.4s, v17.4s +; CHECK-NEXT: ushr v17.4s, v4.4s, #16 +; CHECK-NEXT: add v19.4s, v0.4s, v3.4s +; CHECK-NEXT: add v18.4s, v2.4s, v3.4s +; CHECK-NEXT: add v20.4s, v6.4s, v3.4s +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: and v7.16b, v7.16b, v1.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v16.16b, v16.16b, v1.16b +; CHECK-NEXT: and v1.16b, v17.16b, v1.16b ; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v7.4s, v7.4s, v19.4s ; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s -; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: add v16.4s, v16.4s, v20.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: add v5.4s, v5.4s, v18.4s +; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s +; CHECK-NEXT: add v16.4s, v16.4s, v20.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: orr v6.4s, #64, lsl #16 ; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b -; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: mov v5.16b, v19.16b +; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h @@ -6776,51 +6776,51 @@ entry: define <16 x bfloat> @utofp_v16i64_v16bf16(<16 x i64> %a) { ; CHECK-LABEL: utofp_v16i64_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v2.2d, v2.2d ; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v2.2d, v2.2d ; CHECK-NEXT: ucvtf v6.2d, v6.2d ; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: ucvtf v3.2d, v3.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d +; CHECK-NEXT: ucvtf v3.2d, v3.2d ; CHECK-NEXT: ucvtf v7.2d, v7.2d ; CHECK-NEXT: ucvtf v5.2d, v5.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v6.2s, v6.2d ; CHECK-NEXT: fcvtn v4.2s, v4.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v6.4s, v7.2d ; CHECK-NEXT: fcvtn2 v4.4s, v5.2d -; CHECK-NEXT: movi v3.4s, #1 -; 
CHECK-NEXT: ushr v5.4s, v2.4s, #16 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v7.4s, v0.4s, #16 -; CHECK-NEXT: add v17.4s, v2.4s, v1.4s -; CHECK-NEXT: add v19.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v5.4s, v2.4s, #16 ; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v18.4s, v4.4s, #16 -; CHECK-NEXT: add v20.4s, v6.4s, v1.4s -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: and v5.16b, v5.16b, v3.16b -; CHECK-NEXT: and v7.16b, v7.16b, v3.16b -; CHECK-NEXT: and v16.16b, v16.16b, v3.16b -; CHECK-NEXT: and v3.16b, v18.16b, v3.16b -; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: add v5.4s, v5.4s, v17.4s +; CHECK-NEXT: ushr v17.4s, v4.4s, #16 +; CHECK-NEXT: add v19.4s, v0.4s, v3.4s +; CHECK-NEXT: add v18.4s, v2.4s, v3.4s +; CHECK-NEXT: add v20.4s, v6.4s, v3.4s +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: and v7.16b, v7.16b, v1.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v16.16b, v16.16b, v1.16b +; CHECK-NEXT: and v1.16b, v17.16b, v1.16b ; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v7.4s, v7.4s, v19.4s ; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s -; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: add v16.4s, v16.4s, v20.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: add v5.4s, v5.4s, v18.4s +; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s +; CHECK-NEXT: add v16.4s, v16.4s, v20.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: orr v6.4s, #64, lsl #16 ; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b -; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: mov v5.16b, v19.16b +; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h @@ -6834,107 +6834,107 @@ entry: define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: stofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v16.2d, v2.2d -; CHECK-NEXT: scvtf v17.2d, v0.2d -; CHECK-NEXT: scvtf v18.2d, v3.2d -; CHECK-NEXT: scvtf v19.2d, v6.2d -; CHECK-NEXT: ldp q24, q23, [sp, #96] -; CHECK-NEXT: scvtf v21.2d, v1.2d -; CHECK-NEXT: scvtf v22.2d, v4.2d +; CHECK-NEXT: scvtf v17.2d, v2.2d +; CHECK-NEXT: scvtf v18.2d, v0.2d +; CHECK-NEXT: scvtf v19.2d, v3.2d +; CHECK-NEXT: scvtf v3.2d, v6.2d +; CHECK-NEXT: ldp q21, q20, [sp, #32] +; CHECK-NEXT: scvtf v4.2d, v4.2d ; CHECK-NEXT: scvtf v6.2d, v7.2d -; CHECK-NEXT: scvtf v7.2d, v5.2d -; CHECK-NEXT: movi v3.4s, #127, msl #8 -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: fcvtn v0.2s, v16.2d -; CHECK-NEXT: ldp q20, q16, [sp, #32] -; CHECK-NEXT: fcvtn v1.2s, v17.2d -; CHECK-NEXT: ldp q5, q17, [sp] -; CHECK-NEXT: fcvtn v4.2s, v19.2d -; CHECK-NEXT: scvtf v23.2d, v23.2d +; CHECK-NEXT: scvtf v5.2d, v5.2d +; CHECK-NEXT: ldp q24, q23, [sp, #64] +; CHECK-NEXT: movi v16.4s, #1 +; CHECK-NEXT: fcvtn v0.2s, v17.2d +; CHECK-NEXT: scvtf v17.2d, v1.2d +; CHECK-NEXT: fcvtn v1.2s, v18.2d +; CHECK-NEXT: fcvtn v3.2s, v3.2d +; CHECK-NEXT: ldp q18, q7, [sp] +; CHECK-NEXT: scvtf v21.2d, v21.2d +; CHECK-NEXT: fcvtn v4.2s, v4.2d +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: scvtf v20.2d, v20.2d -; CHECK-NEXT: scvtf v16.2d, v16.2d -; CHECK-NEXT: fcvtn2 v0.4s, v18.2d -; CHECK-NEXT: ldp q19, q18, [sp, #64] -; CHECK-NEXT: scvtf v25.2d, v5.2d -; CHECK-NEXT: fcvtn v5.2s, v22.2d -; CHECK-NEXT: fcvtn2 
v1.4s, v21.2d -; CHECK-NEXT: scvtf v21.2d, v24.2d -; CHECK-NEXT: scvtf v17.2d, v17.2d -; CHECK-NEXT: fcvtn2 v4.4s, v6.2d +; CHECK-NEXT: fcvtn2 v0.4s, v19.2d +; CHECK-NEXT: ldp q22, q19, [sp, #96] +; CHECK-NEXT: fcvtn2 v1.4s, v17.2d +; CHECK-NEXT: fcvtn2 v3.4s, v6.2d +; CHECK-NEXT: scvtf v18.2d, v18.2d +; CHECK-NEXT: scvtf v17.2d, v24.2d +; CHECK-NEXT: fcvtn v6.2s, v21.2d +; CHECK-NEXT: fcvtn2 v4.4s, v5.2d +; CHECK-NEXT: scvtf v22.2d, v22.2d +; CHECK-NEXT: scvtf v21.2d, v23.2d +; CHECK-NEXT: scvtf v7.2d, v7.2d +; CHECK-NEXT: ushr v24.4s, v0.4s, #16 +; CHECK-NEXT: add v5.4s, v0.4s, v2.4s ; CHECK-NEXT: scvtf v19.2d, v19.2d -; CHECK-NEXT: scvtf v6.2d, v18.2d -; CHECK-NEXT: fcvtn v18.2s, v20.2d -; CHECK-NEXT: ushr v22.4s, v0.4s, #16 -; CHECK-NEXT: add v20.4s, v0.4s, v3.4s -; CHECK-NEXT: fcvtn2 v5.4s, v7.2d -; CHECK-NEXT: fcvtn v24.2s, v25.2d -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 -; CHECK-NEXT: fcvtn v21.2s, v21.2d -; CHECK-NEXT: add v26.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v27.4s, v4.4s, #16 -; CHECK-NEXT: fcvtn v19.2s, v19.2d -; CHECK-NEXT: fcvtn2 v18.4s, v16.2d -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b -; CHECK-NEXT: and v7.16b, v7.16b, v2.16b -; CHECK-NEXT: fcmeq v25.4s, v0.4s, v0.4s +; CHECK-NEXT: ushr v23.4s, v1.4s, #16 +; CHECK-NEXT: ushr v25.4s, v3.4s, #16 +; CHECK-NEXT: fcvtn v18.2s, v18.2d +; CHECK-NEXT: fcvtn2 v6.4s, v20.2d +; CHECK-NEXT: add v26.4s, v1.4s, v2.4s +; CHECK-NEXT: fcvtn v17.2s, v17.2d +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: fcvtn v22.2s, v22.2d +; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s +; CHECK-NEXT: and v23.16b, v23.16b, v16.16b ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v5.4s, #16 -; CHECK-NEXT: fcvtn2 v24.4s, v17.2d -; CHECK-NEXT: add v17.4s, v5.4s, v3.4s -; CHECK-NEXT: fcvtn2 v21.4s, v23.2d -; CHECK-NEXT: and v16.16b, v27.16b, v2.16b -; CHECK-NEXT: add v20.4s, v22.4s, v20.4s -; CHECK-NEXT: fcvtn2 v19.4s, v6.2d -; CHECK-NEXT: add v7.4s, v7.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v18.4s, #16 -; CHECK-NEXT: and v23.16b, v28.16b, v2.16b -; CHECK-NEXT: add v22.4s, v4.4s, v3.4s -; CHECK-NEXT: fcmeq v6.4s, v1.4s, v1.4s -; CHECK-NEXT: ushr v27.4s, v24.4s, #16 -; CHECK-NEXT: add v30.4s, v24.4s, v3.4s +; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s +; CHECK-NEXT: fcvtn2 v18.4s, v7.2d +; CHECK-NEXT: add v7.4s, v3.4s, v2.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 +; CHECK-NEXT: add v5.4s, v24.4s, v5.4s +; CHECK-NEXT: and v24.16b, v25.16b, v16.16b +; CHECK-NEXT: ushr v25.4s, v4.4s, #16 +; CHECK-NEXT: fcvtn2 v22.4s, v19.2d +; CHECK-NEXT: add v19.4s, v23.4s, v26.4s +; CHECK-NEXT: ushr v26.4s, v6.4s, #16 +; CHECK-NEXT: fcvtn2 v17.4s, v21.2d +; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s ; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v21.4s, #16 -; CHECK-NEXT: add v31.4s, v21.4s, v3.4s -; CHECK-NEXT: and v26.16b, v26.16b, v2.16b -; CHECK-NEXT: add v17.4s, v23.4s, v17.4s -; CHECK-NEXT: add v23.4s, v18.4s, v3.4s -; CHECK-NEXT: ushr v29.4s, v19.4s, #16 -; CHECK-NEXT: and v27.16b, v27.16b, v2.16b -; CHECK-NEXT: add v3.4s, v19.4s, v3.4s -; CHECK-NEXT: add v16.4s, v16.4s, v22.4s -; CHECK-NEXT: and v28.16b, v28.16b, v2.16b -; CHECK-NEXT: fcmeq v22.4s, v4.4s, v4.4s -; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: and v2.16b, v29.16b, v2.16b -; CHECK-NEXT: fcmeq v29.4s, v5.4s, v5.4s -; CHECK-NEXT: orr v5.4s, #64, lsl #16 -; CHECK-NEXT: add v23.4s, v26.4s, v23.4s -; CHECK-NEXT: fcmeq v26.4s, v18.4s, v18.4s -; CHECK-NEXT: add v27.4s, v27.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v24.4s, v24.4s +; CHECK-NEXT: and v23.16b, v25.16b, v16.16b +; 
CHECK-NEXT: add v25.4s, v4.4s, v2.4s +; CHECK-NEXT: add v7.4s, v24.4s, v7.4s +; CHECK-NEXT: ushr v24.4s, v18.4s, #16 +; CHECK-NEXT: add v30.4s, v18.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b +; CHECK-NEXT: ushr v28.4s, v22.4s, #16 +; CHECK-NEXT: add v31.4s, v22.4s, v2.4s +; CHECK-NEXT: add v23.4s, v23.4s, v25.4s +; CHECK-NEXT: and v25.16b, v26.16b, v16.16b +; CHECK-NEXT: add v26.4s, v6.4s, v2.4s +; CHECK-NEXT: ushr v29.4s, v17.4s, #16 +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: add v2.4s, v17.4s, v2.4s +; CHECK-NEXT: and v28.16b, v28.16b, v16.16b +; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b +; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b +; CHECK-NEXT: add v25.4s, v25.4s, v26.4s +; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s +; CHECK-NEXT: orr v6.4s, #64, lsl #16 +; CHECK-NEXT: and v16.16b, v29.16b, v16.16b +; CHECK-NEXT: add v24.4s, v24.4s, v30.4s +; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s ; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v21.4s, v21.4s -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v3.4s, v19.4s, v19.4s +; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s +; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v24.4s, #64, lsl #16 -; CHECK-NEXT: orr v21.4s, #64, lsl #16 -; CHECK-NEXT: orr v19.4s, #64, lsl #16 -; CHECK-NEXT: bit v1.16b, v7.16b, v6.16b -; CHECK-NEXT: bit v4.16b, v16.16b, v22.16b -; CHECK-NEXT: mov v6.16b, v26.16b -; CHECK-NEXT: mov v7.16b, v30.16b -; CHECK-NEXT: mov v16.16b, v31.16b -; CHECK-NEXT: bit v0.16b, v20.16b, v25.16b -; CHECK-NEXT: bit v5.16b, v17.16b, v29.16b -; CHECK-NEXT: bsl v3.16b, v2.16b, v19.16b -; CHECK-NEXT: bsl v6.16b, v23.16b, v18.16b -; CHECK-NEXT: bsl v7.16b, v27.16b, v24.16b -; CHECK-NEXT: bsl v16.16b, v28.16b, v21.16b +; CHECK-NEXT: orr v22.4s, #64, lsl #16 +; CHECK-NEXT: mov v5.16b, v26.16b +; CHECK-NEXT: add v2.4s, v16.4s, v2.4s +; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s +; CHECK-NEXT: orr v17.4s, #64, lsl #16 ; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h -; CHECK-NEXT: uzp2 v1.8h, v5.8h, v4.8h -; CHECK-NEXT: uzp2 v2.8h, v7.8h, v6.8h -; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h +; CHECK-NEXT: mov v7.16b, v31.16b +; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b +; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v30.16b +; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b +; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b +; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h +; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h +; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h ; CHECK-NEXT: ret entry: %c = sitofp <32 x i64> %a to <32 x bfloat> @@ -6944,107 +6944,107 @@ entry: define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: utofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v16.2d, v2.2d -; CHECK-NEXT: ucvtf v17.2d, v0.2d -; CHECK-NEXT: ucvtf v18.2d, v3.2d -; CHECK-NEXT: ucvtf v19.2d, v6.2d -; CHECK-NEXT: ldp q24, q23, [sp, #96] -; CHECK-NEXT: ucvtf v21.2d, v1.2d -; CHECK-NEXT: ucvtf v22.2d, v4.2d +; CHECK-NEXT: ucvtf v17.2d, v2.2d +; CHECK-NEXT: ucvtf v18.2d, v0.2d +; CHECK-NEXT: ucvtf v19.2d, v3.2d +; CHECK-NEXT: ucvtf v3.2d, v6.2d +; CHECK-NEXT: ldp q21, q20, [sp, #32] +; CHECK-NEXT: ucvtf v4.2d, v4.2d ; CHECK-NEXT: ucvtf v6.2d, v7.2d -; CHECK-NEXT: ucvtf v7.2d, v5.2d -; CHECK-NEXT: movi v3.4s, #127, msl #8 -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: fcvtn v0.2s, v16.2d -; CHECK-NEXT: ldp q20, q16, [sp, #32] -; CHECK-NEXT: fcvtn v1.2s, v17.2d -; CHECK-NEXT: ldp q5, 
q17, [sp] -; CHECK-NEXT: fcvtn v4.2s, v19.2d -; CHECK-NEXT: ucvtf v23.2d, v23.2d +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ldp q24, q23, [sp, #64] +; CHECK-NEXT: movi v16.4s, #1 +; CHECK-NEXT: fcvtn v0.2s, v17.2d +; CHECK-NEXT: ucvtf v17.2d, v1.2d +; CHECK-NEXT: fcvtn v1.2s, v18.2d +; CHECK-NEXT: fcvtn v3.2s, v3.2d +; CHECK-NEXT: ldp q18, q7, [sp] +; CHECK-NEXT: ucvtf v21.2d, v21.2d +; CHECK-NEXT: fcvtn v4.2s, v4.2d +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v20.2d, v20.2d -; CHECK-NEXT: ucvtf v16.2d, v16.2d -; CHECK-NEXT: fcvtn2 v0.4s, v18.2d -; CHECK-NEXT: ldp q19, q18, [sp, #64] -; CHECK-NEXT: ucvtf v25.2d, v5.2d -; CHECK-NEXT: fcvtn v5.2s, v22.2d -; CHECK-NEXT: fcvtn2 v1.4s, v21.2d -; CHECK-NEXT: ucvtf v21.2d, v24.2d -; CHECK-NEXT: ucvtf v17.2d, v17.2d -; CHECK-NEXT: fcvtn2 v4.4s, v6.2d +; CHECK-NEXT: fcvtn2 v0.4s, v19.2d +; CHECK-NEXT: ldp q22, q19, [sp, #96] +; CHECK-NEXT: fcvtn2 v1.4s, v17.2d +; CHECK-NEXT: fcvtn2 v3.4s, v6.2d +; CHECK-NEXT: ucvtf v18.2d, v18.2d +; CHECK-NEXT: ucvtf v17.2d, v24.2d +; CHECK-NEXT: fcvtn v6.2s, v21.2d +; CHECK-NEXT: fcvtn2 v4.4s, v5.2d +; CHECK-NEXT: ucvtf v22.2d, v22.2d +; CHECK-NEXT: ucvtf v21.2d, v23.2d +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: ushr v24.4s, v0.4s, #16 +; CHECK-NEXT: add v5.4s, v0.4s, v2.4s ; CHECK-NEXT: ucvtf v19.2d, v19.2d -; CHECK-NEXT: ucvtf v6.2d, v18.2d -; CHECK-NEXT: fcvtn v18.2s, v20.2d -; CHECK-NEXT: ushr v22.4s, v0.4s, #16 -; CHECK-NEXT: add v20.4s, v0.4s, v3.4s -; CHECK-NEXT: fcvtn2 v5.4s, v7.2d -; CHECK-NEXT: fcvtn v24.2s, v25.2d -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 -; CHECK-NEXT: fcvtn v21.2s, v21.2d -; CHECK-NEXT: add v26.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v27.4s, v4.4s, #16 -; CHECK-NEXT: fcvtn v19.2s, v19.2d -; CHECK-NEXT: fcvtn2 v18.4s, v16.2d -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b -; CHECK-NEXT: and v7.16b, v7.16b, v2.16b -; CHECK-NEXT: fcmeq v25.4s, v0.4s, v0.4s +; CHECK-NEXT: ushr v23.4s, v1.4s, #16 +; CHECK-NEXT: ushr v25.4s, v3.4s, #16 +; CHECK-NEXT: fcvtn v18.2s, v18.2d +; CHECK-NEXT: fcvtn2 v6.4s, v20.2d +; CHECK-NEXT: add v26.4s, v1.4s, v2.4s +; CHECK-NEXT: fcvtn v17.2s, v17.2d +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: fcvtn v22.2s, v22.2d +; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s +; CHECK-NEXT: and v23.16b, v23.16b, v16.16b ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v5.4s, #16 -; CHECK-NEXT: fcvtn2 v24.4s, v17.2d -; CHECK-NEXT: add v17.4s, v5.4s, v3.4s -; CHECK-NEXT: fcvtn2 v21.4s, v23.2d -; CHECK-NEXT: and v16.16b, v27.16b, v2.16b -; CHECK-NEXT: add v20.4s, v22.4s, v20.4s -; CHECK-NEXT: fcvtn2 v19.4s, v6.2d -; CHECK-NEXT: add v7.4s, v7.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v18.4s, #16 -; CHECK-NEXT: and v23.16b, v28.16b, v2.16b -; CHECK-NEXT: add v22.4s, v4.4s, v3.4s -; CHECK-NEXT: fcmeq v6.4s, v1.4s, v1.4s -; CHECK-NEXT: ushr v27.4s, v24.4s, #16 -; CHECK-NEXT: add v30.4s, v24.4s, v3.4s +; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s +; CHECK-NEXT: fcvtn2 v18.4s, v7.2d +; CHECK-NEXT: add v7.4s, v3.4s, v2.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 +; CHECK-NEXT: add v5.4s, v24.4s, v5.4s +; CHECK-NEXT: and v24.16b, v25.16b, v16.16b +; CHECK-NEXT: ushr v25.4s, v4.4s, #16 +; CHECK-NEXT: fcvtn2 v22.4s, v19.2d +; CHECK-NEXT: add v19.4s, v23.4s, v26.4s +; CHECK-NEXT: ushr v26.4s, v6.4s, #16 +; CHECK-NEXT: fcvtn2 v17.4s, v21.2d +; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s ; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v21.4s, #16 -; CHECK-NEXT: add v31.4s, v21.4s, v3.4s -; CHECK-NEXT: and v26.16b, v26.16b, v2.16b -; 
CHECK-NEXT: add v17.4s, v23.4s, v17.4s -; CHECK-NEXT: add v23.4s, v18.4s, v3.4s -; CHECK-NEXT: ushr v29.4s, v19.4s, #16 -; CHECK-NEXT: and v27.16b, v27.16b, v2.16b -; CHECK-NEXT: add v3.4s, v19.4s, v3.4s -; CHECK-NEXT: add v16.4s, v16.4s, v22.4s -; CHECK-NEXT: and v28.16b, v28.16b, v2.16b -; CHECK-NEXT: fcmeq v22.4s, v4.4s, v4.4s -; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: and v2.16b, v29.16b, v2.16b -; CHECK-NEXT: fcmeq v29.4s, v5.4s, v5.4s -; CHECK-NEXT: orr v5.4s, #64, lsl #16 -; CHECK-NEXT: add v23.4s, v26.4s, v23.4s -; CHECK-NEXT: fcmeq v26.4s, v18.4s, v18.4s -; CHECK-NEXT: add v27.4s, v27.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v24.4s, v24.4s +; CHECK-NEXT: and v23.16b, v25.16b, v16.16b +; CHECK-NEXT: add v25.4s, v4.4s, v2.4s +; CHECK-NEXT: add v7.4s, v24.4s, v7.4s +; CHECK-NEXT: ushr v24.4s, v18.4s, #16 +; CHECK-NEXT: add v30.4s, v18.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b +; CHECK-NEXT: ushr v28.4s, v22.4s, #16 +; CHECK-NEXT: add v31.4s, v22.4s, v2.4s +; CHECK-NEXT: add v23.4s, v23.4s, v25.4s +; CHECK-NEXT: and v25.16b, v26.16b, v16.16b +; CHECK-NEXT: add v26.4s, v6.4s, v2.4s +; CHECK-NEXT: ushr v29.4s, v17.4s, #16 +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: add v2.4s, v17.4s, v2.4s +; CHECK-NEXT: and v28.16b, v28.16b, v16.16b +; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b +; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b +; CHECK-NEXT: add v25.4s, v25.4s, v26.4s +; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s +; CHECK-NEXT: orr v6.4s, #64, lsl #16 +; CHECK-NEXT: and v16.16b, v29.16b, v16.16b +; CHECK-NEXT: add v24.4s, v24.4s, v30.4s +; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s ; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v21.4s, v21.4s -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v3.4s, v19.4s, v19.4s +; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s +; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v24.4s, #64, lsl #16 -; CHECK-NEXT: orr v21.4s, #64, lsl #16 -; CHECK-NEXT: orr v19.4s, #64, lsl #16 -; CHECK-NEXT: bit v1.16b, v7.16b, v6.16b -; CHECK-NEXT: bit v4.16b, v16.16b, v22.16b -; CHECK-NEXT: mov v6.16b, v26.16b -; CHECK-NEXT: mov v7.16b, v30.16b -; CHECK-NEXT: mov v16.16b, v31.16b -; CHECK-NEXT: bit v0.16b, v20.16b, v25.16b -; CHECK-NEXT: bit v5.16b, v17.16b, v29.16b -; CHECK-NEXT: bsl v3.16b, v2.16b, v19.16b -; CHECK-NEXT: bsl v6.16b, v23.16b, v18.16b -; CHECK-NEXT: bsl v7.16b, v27.16b, v24.16b -; CHECK-NEXT: bsl v16.16b, v28.16b, v21.16b +; CHECK-NEXT: orr v22.4s, #64, lsl #16 +; CHECK-NEXT: mov v5.16b, v26.16b +; CHECK-NEXT: add v2.4s, v16.4s, v2.4s +; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s +; CHECK-NEXT: orr v17.4s, #64, lsl #16 ; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h -; CHECK-NEXT: uzp2 v1.8h, v5.8h, v4.8h -; CHECK-NEXT: uzp2 v2.8h, v7.8h, v6.8h -; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h +; CHECK-NEXT: mov v7.16b, v31.16b +; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b +; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v30.16b +; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b +; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b +; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h +; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h +; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h ; CHECK-NEXT: ret entry: %c = uitofp <32 x i64> %a to <32 x bfloat> @@ -7059,9 +7059,9 @@ define <2 x bfloat> @stofp_v2i32_v2bf16(<2 x i32> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, 
v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <2 x i32> %a to <2 x bfloat> @@ -7076,9 +7076,9 @@ define <2 x bfloat> @utofp_v2i32_v2bf16(<2 x i32> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <2 x i32> %a to <2 x bfloat> @@ -7092,9 +7092,9 @@ define <3 x bfloat> @stofp_v3i32_v3bf16(<3 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <3 x i32> %a to <3 x bfloat> @@ -7108,9 +7108,9 @@ define <3 x bfloat> @utofp_v3i32_v3bf16(<3 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <3 x i32> %a to <3 x bfloat> @@ -7124,9 +7124,9 @@ define <4 x bfloat> @stofp_v4i32_v4bf16(<4 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <4 x i32> %a to <4 x bfloat> @@ -7140,9 +7140,9 @@ define <4 x bfloat> @utofp_v4i32_v4bf16(<4 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <4 x i32> %a to <4 x bfloat> @@ -7155,15 +7155,15 @@ define <8 x bfloat> @stofp_v8i32_v8bf16(<8 x i32> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: scvtf v1.4s, v1.4s +; CHECK-NEXT: movi v5.4s, #127, msl #8 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v4.4s, v1.4s, #16 ; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v2.16b, v4.16b, v2.16b ; CHECK-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v3.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v1.4s, v5.4s ; CHECK-NEXT: ret entry: %c = sitofp <8 x i32> %a to <8 x bfloat> @@ -7176,15 +7176,15 @@ define <8 x bfloat> @utofp_v8i32_v8bf16(<8 x i32> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: ucvtf v1.4s, v1.4s +; CHECK-NEXT: movi v5.4s, #127, msl #8 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v4.4s, v1.4s, #16 ; 
CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v2.16b, v4.16b, v2.16b ; CHECK-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v3.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v1.4s, v5.4s ; CHECK-NEXT: ret entry: %c = uitofp <8 x i32> %a to <8 x bfloat> @@ -7194,28 +7194,28 @@ entry: define <16 x bfloat> @stofp_v16i32_v16bf16(<16 x i32> %a) { ; CHECK-LABEL: stofp_v16i32_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: scvtf v2.4s, v2.4s -; CHECK-NEXT: movi v4.4s, #1 -; CHECK-NEXT: scvtf v1.4s, v1.4s +; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: scvtf v4.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: scvtf v3.4s, v3.4s +; CHECK-NEXT: movi v17.4s, #127, msl #8 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 ; CHECK-NEXT: ushr v6.4s, v2.4s, #16 -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 +; CHECK-NEXT: ushr v7.4s, v4.4s, #16 ; CHECK-NEXT: ushr v16.4s, v3.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v4.16b -; CHECK-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v6.16b, v6.16b, v1.16b ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: movi v6.4s, #127, msl #8 -; CHECK-NEXT: and v5.16b, v7.16b, v4.16b -; CHECK-NEXT: and v4.16b, v16.16b, v4.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v6.4s -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v6.4s -; CHECK-NEXT: addhn2 v0.8h, v5.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v3.4s, v6.4s +; CHECK-NEXT: and v5.16b, v7.16b, v1.16b +; CHECK-NEXT: and v6.16b, v16.16b, v1.16b +; CHECK-NEXT: addhn v0.4h, v0.4s, v17.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v17.4s +; CHECK-NEXT: add v2.4s, v5.4s, v4.4s +; CHECK-NEXT: add v3.4s, v6.4s, v3.4s +; CHECK-NEXT: addhn2 v0.8h, v2.4s, v17.4s +; CHECK-NEXT: addhn2 v1.8h, v3.4s, v17.4s ; CHECK-NEXT: ret entry: %c = sitofp <16 x i32> %a to <16 x bfloat> @@ -7225,28 +7225,28 @@ entry: define <16 x bfloat> @utofp_v16i32_v16bf16(<16 x i32> %a) { ; CHECK-LABEL: utofp_v16i32_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ucvtf v2.4s, v2.4s -; CHECK-NEXT: movi v4.4s, #1 -; CHECK-NEXT: ucvtf v1.4s, v1.4s +; CHECK-NEXT: ucvtf v0.4s, v0.4s +; CHECK-NEXT: ucvtf v4.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ucvtf v3.4s, v3.4s +; CHECK-NEXT: movi v17.4s, #127, msl #8 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 ; CHECK-NEXT: ushr v6.4s, v2.4s, #16 -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 +; CHECK-NEXT: ushr v7.4s, v4.4s, #16 ; CHECK-NEXT: ushr v16.4s, v3.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v4.16b -; CHECK-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v6.16b, v6.16b, v1.16b ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: movi v6.4s, #127, msl #8 -; CHECK-NEXT: and v5.16b, v7.16b, v4.16b -; CHECK-NEXT: and v4.16b, v16.16b, v4.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v6.4s -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v6.4s -; CHECK-NEXT: addhn2 v0.8h, v5.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v3.4s, v6.4s +; CHECK-NEXT: and v5.16b, v7.16b, v1.16b +; CHECK-NEXT: and v6.16b, v16.16b, v1.16b +; CHECK-NEXT: addhn v0.4h, v0.4s, v17.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v17.4s +; CHECK-NEXT: add v2.4s, v5.4s, 
v4.4s +; CHECK-NEXT: add v3.4s, v6.4s, v3.4s +; CHECK-NEXT: addhn2 v0.8h, v2.4s, v17.4s +; CHECK-NEXT: addhn2 v1.8h, v3.4s, v17.4s ; CHECK-NEXT: ret entry: %c = uitofp <16 x i32> %a to <16 x bfloat> @@ -7262,42 +7262,42 @@ define <32 x bfloat> @stofp_v32i32_v32bf16(<32 x i32> %a) { ; CHECK-NEXT: scvtf v6.4s, v6.4s ; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: scvtf v1.4s, v1.4s -; CHECK-NEXT: scvtf v3.4s, v3.4s +; CHECK-NEXT: scvtf v17.4s, v3.4s ; CHECK-NEXT: scvtf v5.4s, v5.4s ; CHECK-NEXT: scvtf v7.4s, v7.4s -; CHECK-NEXT: ushr v17.4s, v0.4s, #16 +; CHECK-NEXT: movi v21.4s, #127, msl #8 +; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v18.4s, v2.4s, #16 ; CHECK-NEXT: ushr v19.4s, v4.4s, #16 ; CHECK-NEXT: ushr v20.4s, v6.4s, #16 -; CHECK-NEXT: ushr v21.4s, v1.4s, #16 -; CHECK-NEXT: ushr v22.4s, v3.4s, #16 -; CHECK-NEXT: ushr v23.4s, v5.4s, #16 -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b +; CHECK-NEXT: ushr v22.4s, v1.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v5.4s, #16 +; CHECK-NEXT: ushr v25.4s, v7.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b ; CHECK-NEXT: and v18.16b, v18.16b, v16.16b ; CHECK-NEXT: and v19.16b, v19.16b, v16.16b ; CHECK-NEXT: and v20.16b, v20.16b, v16.16b -; CHECK-NEXT: and v21.16b, v21.16b, v16.16b -; CHECK-NEXT: and v22.16b, v22.16b, v16.16b -; CHECK-NEXT: add v0.4s, v17.4s, v0.4s -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 +; CHECK-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-NEXT: and v3.16b, v22.16b, v16.16b ; CHECK-NEXT: add v2.4s, v18.4s, v2.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 ; CHECK-NEXT: add v4.4s, v19.4s, v4.4s ; CHECK-NEXT: add v6.4s, v20.4s, v6.4s -; CHECK-NEXT: and v19.16b, v23.16b, v16.16b -; CHECK-NEXT: add v20.4s, v22.4s, v3.4s -; CHECK-NEXT: and v16.16b, v17.16b, v16.16b -; CHECK-NEXT: add v17.4s, v21.4s, v1.4s +; CHECK-NEXT: and v18.16b, v23.16b, v16.16b +; CHECK-NEXT: and v19.16b, v24.16b, v16.16b +; CHECK-NEXT: and v16.16b, v25.16b, v16.16b +; CHECK-NEXT: add v20.4s, v3.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v21.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v21.4s +; CHECK-NEXT: addhn v2.4h, v4.4s, v21.4s +; CHECK-NEXT: addhn v3.4h, v6.4s, v21.4s +; CHECK-NEXT: add v4.4s, v18.4s, v17.4s ; CHECK-NEXT: add v5.4s, v19.4s, v5.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v18.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v18.4s -; CHECK-NEXT: addhn v2.4h, v4.4s, v18.4s -; CHECK-NEXT: add v4.4s, v16.4s, v7.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v17.4s, v18.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v18.4s -; CHECK-NEXT: addhn2 v2.8h, v5.4s, v18.4s -; CHECK-NEXT: addhn2 v3.8h, v4.4s, v18.4s +; CHECK-NEXT: add v6.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v20.4s, v21.4s +; CHECK-NEXT: addhn2 v1.8h, v4.4s, v21.4s +; CHECK-NEXT: addhn2 v2.8h, v5.4s, v21.4s +; CHECK-NEXT: addhn2 v3.8h, v6.4s, v21.4s ; CHECK-NEXT: ret entry: %c = sitofp <32 x i32> %a to <32 x bfloat> @@ -7313,42 +7313,42 @@ define <32 x bfloat> @utofp_v32i32_v32bf16(<32 x i32> %a) { ; CHECK-NEXT: ucvtf v6.4s, v6.4s ; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: ucvtf v1.4s, v1.4s -; CHECK-NEXT: ucvtf v3.4s, v3.4s +; CHECK-NEXT: ucvtf v17.4s, v3.4s ; CHECK-NEXT: ucvtf v5.4s, v5.4s ; CHECK-NEXT: ucvtf v7.4s, v7.4s -; CHECK-NEXT: ushr v17.4s, v0.4s, #16 +; CHECK-NEXT: movi v21.4s, #127, msl #8 +; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v18.4s, v2.4s, #16 ; CHECK-NEXT: ushr v19.4s, v4.4s, #16 ; CHECK-NEXT: ushr v20.4s, v6.4s, #16 -; CHECK-NEXT: ushr v21.4s, v1.4s, #16 -; CHECK-NEXT: ushr v22.4s, v3.4s, #16 -; CHECK-NEXT: ushr 
v23.4s, v5.4s, #16 -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b +; CHECK-NEXT: ushr v22.4s, v1.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v5.4s, #16 +; CHECK-NEXT: ushr v25.4s, v7.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b ; CHECK-NEXT: and v18.16b, v18.16b, v16.16b ; CHECK-NEXT: and v19.16b, v19.16b, v16.16b ; CHECK-NEXT: and v20.16b, v20.16b, v16.16b -; CHECK-NEXT: and v21.16b, v21.16b, v16.16b -; CHECK-NEXT: and v22.16b, v22.16b, v16.16b -; CHECK-NEXT: add v0.4s, v17.4s, v0.4s -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 +; CHECK-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-NEXT: and v3.16b, v22.16b, v16.16b ; CHECK-NEXT: add v2.4s, v18.4s, v2.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 ; CHECK-NEXT: add v4.4s, v19.4s, v4.4s ; CHECK-NEXT: add v6.4s, v20.4s, v6.4s -; CHECK-NEXT: and v19.16b, v23.16b, v16.16b -; CHECK-NEXT: add v20.4s, v22.4s, v3.4s -; CHECK-NEXT: and v16.16b, v17.16b, v16.16b -; CHECK-NEXT: add v17.4s, v21.4s, v1.4s +; CHECK-NEXT: and v18.16b, v23.16b, v16.16b +; CHECK-NEXT: and v19.16b, v24.16b, v16.16b +; CHECK-NEXT: and v16.16b, v25.16b, v16.16b +; CHECK-NEXT: add v20.4s, v3.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v21.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v21.4s +; CHECK-NEXT: addhn v2.4h, v4.4s, v21.4s +; CHECK-NEXT: addhn v3.4h, v6.4s, v21.4s +; CHECK-NEXT: add v4.4s, v18.4s, v17.4s ; CHECK-NEXT: add v5.4s, v19.4s, v5.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v18.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v18.4s -; CHECK-NEXT: addhn v2.4h, v4.4s, v18.4s -; CHECK-NEXT: add v4.4s, v16.4s, v7.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v17.4s, v18.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v18.4s -; CHECK-NEXT: addhn2 v2.8h, v5.4s, v18.4s -; CHECK-NEXT: addhn2 v3.8h, v4.4s, v18.4s +; CHECK-NEXT: add v6.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v20.4s, v21.4s +; CHECK-NEXT: addhn2 v1.8h, v4.4s, v21.4s +; CHECK-NEXT: addhn2 v2.8h, v5.4s, v21.4s +; CHECK-NEXT: addhn2 v3.8h, v6.4s, v21.4s ; CHECK-NEXT: ret entry: %c = uitofp <32 x i32> %a to <32 x bfloat> @@ -7364,9 +7364,9 @@ define <2 x bfloat> @stofp_v2i16_v2bf16(<2 x i16> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <2 x i16> %a to <2 x bfloat> @@ -7382,9 +7382,9 @@ define <2 x bfloat> @utofp_v2i16_v2bf16(<2 x i16> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <2 x i16> %a to <2 x bfloat> @@ -7399,9 +7399,9 @@ define <3 x bfloat> @stofp_v3i16_v3bf16(<3 x i16> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <3 x i16> %a to <3 x bfloat> @@ -7416,9 +7416,9 @@ define <3 x bfloat> @utofp_v3i16_v3bf16(<3 x i16> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: 
ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <3 x i16> %a to <3 x bfloat> @@ -7433,9 +7433,9 @@ define <4 x bfloat> @stofp_v4i16_v4bf16(<4 x i16> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <4 x i16> %a to <4 x bfloat> @@ -7450,9 +7450,9 @@ define <4 x bfloat> @utofp_v4i16_v4bf16(<4 x i16> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <4 x i16> %a to <4 x bfloat> @@ -7513,27 +7513,27 @@ define <16 x bfloat> @stofp_v16i16_v16bf16(<16 x i16> %a) { ; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0 ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v7.4s, #127, msl #8 ; CHECK-NEXT: scvtf v3.4s, v3.4s ; CHECK-NEXT: scvtf v4.4s, v4.4s -; CHECK-NEXT: scvtf v6.4s, v0.4s -; CHECK-NEXT: scvtf v7.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: ushr v5.4s, v3.4s, #16 -; CHECK-NEXT: ushr v0.4s, v4.4s, #16 -; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v2.16b +; CHECK-NEXT: scvtf v5.4s, v0.4s +; CHECK-NEXT: scvtf v6.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v3.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v16.4s, v5.4s, #16 +; CHECK-NEXT: ushr v17.4s, v6.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v16.16b, v16.16b, v2.16b ; CHECK-NEXT: and v2.16b, v17.16b, v2.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: add v18.4s, v0.4s, v1.4s -; CHECK-NEXT: add v2.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v1.4s -; CHECK-NEXT: addhn v1.4h, v4.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v3.4s -; CHECK-NEXT: addhn2 v1.8h, v7.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v7.4s +; CHECK-NEXT: add v1.4s, v1.4s, v7.4s +; CHECK-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-NEXT: addhn v0.4h, v3.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v5.4s, v3.4s +; CHECK-NEXT: addhn2 v1.8h, v6.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <16 x i16> %a to <16 x bfloat> @@ -7548,27 +7548,27 @@ define <16 x bfloat> @utofp_v16i16_v16bf16(<16 x i16> %a) { ; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v7.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v3.4s, v3.4s ; CHECK-NEXT: ucvtf v4.4s, v4.4s -; CHECK-NEXT: ucvtf v6.4s, v0.4s -; CHECK-NEXT: ucvtf v7.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: ushr v5.4s, v3.4s, #16 -; CHECK-NEXT: ushr v0.4s, v4.4s, #16 -; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v2.16b +; 
CHECK-NEXT: ucvtf v5.4s, v0.4s +; CHECK-NEXT: ucvtf v6.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v3.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v16.4s, v5.4s, #16 +; CHECK-NEXT: ushr v17.4s, v6.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v16.16b, v16.16b, v2.16b ; CHECK-NEXT: and v2.16b, v17.16b, v2.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: add v18.4s, v0.4s, v1.4s -; CHECK-NEXT: add v2.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v1.4s -; CHECK-NEXT: addhn v1.4h, v4.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v3.4s -; CHECK-NEXT: addhn2 v1.8h, v7.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v7.4s +; CHECK-NEXT: add v1.4s, v1.4s, v7.4s +; CHECK-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-NEXT: addhn v0.4h, v3.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v5.4s, v3.4s +; CHECK-NEXT: addhn2 v1.8h, v6.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <16 x i16> %a to <16 x bfloat> @@ -7578,56 +7578,56 @@ entry: define <32 x bfloat> @stofp_v32i16_v32bf16(<32 x i16> %a) { ; CHECK-LABEL: stofp_v32i16_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v5.4s, v1.4h, #0 -; CHECK-NEXT: sshll v4.4s, v0.4h, #0 +; CHECK-NEXT: sshll v4.4s, v1.4h, #0 +; CHECK-NEXT: sshll v5.4s, v0.4h, #0 ; CHECK-NEXT: sshll v6.4s, v2.4h, #0 ; CHECK-NEXT: sshll v7.4s, v3.4h, #0 ; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0 ; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: scvtf v5.4s, v5.4s ; CHECK-NEXT: scvtf v4.4s, v4.4s ; CHECK-NEXT: scvtf v6.4s, v6.4s ; CHECK-NEXT: scvtf v7.4s, v7.4s -; CHECK-NEXT: scvtf v19.4s, v0.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 -; CHECK-NEXT: scvtf v20.4s, v1.4s -; CHECK-NEXT: scvtf v21.4s, v2.4s -; CHECK-NEXT: scvtf v22.4s, v3.4s +; CHECK-NEXT: scvtf v17.4s, v0.4s +; CHECK-NEXT: scvtf v18.4s, v1.4s +; CHECK-NEXT: scvtf v19.4s, v2.4s +; CHECK-NEXT: scvtf v20.4s, v3.4s +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: ushr v0.4s, v5.4s, #16 -; CHECK-NEXT: ushr v17.4s, v4.4s, #16 -; CHECK-NEXT: ushr v1.4s, v6.4s, #16 -; CHECK-NEXT: ushr v2.4s, v7.4s, #16 -; CHECK-NEXT: ushr v23.4s, v20.4s, #16 -; CHECK-NEXT: ushr v25.4s, v22.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v2.4s, v6.4s, #16 +; CHECK-NEXT: ushr v3.4s, v7.4s, #16 +; CHECK-NEXT: ushr v22.4s, v17.4s, #16 +; CHECK-NEXT: ushr v23.4s, v18.4s, #16 +; CHECK-NEXT: ushr v24.4s, v19.4s, #16 +; CHECK-NEXT: ushr v25.4s, v20.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v16.16b -; CHECK-NEXT: and v3.16b, v17.16b, v16.16b ; CHECK-NEXT: and v1.16b, v1.16b, v16.16b ; CHECK-NEXT: and v2.16b, v2.16b, v16.16b -; CHECK-NEXT: ushr v17.4s, v19.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b +; CHECK-NEXT: and v22.16b, v22.16b, v16.16b ; CHECK-NEXT: and v23.16b, v23.16b, v16.16b -; CHECK-NEXT: add v24.4s, v0.4s, v18.4s -; CHECK-NEXT: ushr v0.4s, v21.4s, #16 -; CHECK-NEXT: add v3.4s, v3.4s, v18.4s -; CHECK-NEXT: add v26.4s, v1.4s, v18.4s -; CHECK-NEXT: add v27.4s, v2.4s, v18.4s -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b -; CHECK-NEXT: and v28.16b, v0.16b, v16.16b +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b ; CHECK-NEXT: and v16.16b, v25.16b, v16.16b -; CHECK-NEXT: addhn v0.4h, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v24.4s -; CHECK-NEXT: add v4.4s, v17.4s, v18.4s -; 
CHECK-NEXT: addhn v2.4h, v6.4s, v26.4s -; CHECK-NEXT: add v5.4s, v23.4s, v18.4s -; CHECK-NEXT: addhn v3.4h, v7.4s, v27.4s -; CHECK-NEXT: add v6.4s, v28.4s, v18.4s -; CHECK-NEXT: add v16.4s, v16.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v4.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v5.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v6.4s -; CHECK-NEXT: addhn2 v3.8h, v22.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v1.4s, v1.4s, v21.4s +; CHECK-NEXT: add v2.4s, v2.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: addhn v0.4h, v5.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: addhn v2.4h, v6.4s, v2.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v3.4s +; CHECK-NEXT: add v4.4s, v22.4s, v21.4s +; CHECK-NEXT: add v5.4s, v23.4s, v21.4s +; CHECK-NEXT: add v6.4s, v24.4s, v21.4s +; CHECK-NEXT: add v7.4s, v16.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v17.4s, v4.4s +; CHECK-NEXT: addhn2 v1.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v6.4s +; CHECK-NEXT: addhn2 v3.8h, v20.4s, v7.4s ; CHECK-NEXT: ret entry: %c = sitofp <32 x i16> %a to <32 x bfloat> @@ -7637,56 +7637,56 @@ entry: define <32 x bfloat> @utofp_v32i16_v32bf16(<32 x i16> %a) { ; CHECK-LABEL: utofp_v32i16_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 +; CHECK-NEXT: ushll v4.4s, v1.4h, #0 +; CHECK-NEXT: ushll v5.4s, v0.4h, #0 ; CHECK-NEXT: ushll v6.4s, v2.4h, #0 ; CHECK-NEXT: ushll v7.4s, v3.4h, #0 ; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: ucvtf v5.4s, v5.4s ; CHECK-NEXT: ucvtf v4.4s, v4.4s ; CHECK-NEXT: ucvtf v6.4s, v6.4s ; CHECK-NEXT: ucvtf v7.4s, v7.4s -; CHECK-NEXT: ucvtf v19.4s, v0.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 -; CHECK-NEXT: ucvtf v20.4s, v1.4s -; CHECK-NEXT: ucvtf v21.4s, v2.4s -; CHECK-NEXT: ucvtf v22.4s, v3.4s +; CHECK-NEXT: ucvtf v17.4s, v0.4s +; CHECK-NEXT: ucvtf v18.4s, v1.4s +; CHECK-NEXT: ucvtf v19.4s, v2.4s +; CHECK-NEXT: ucvtf v20.4s, v3.4s +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: ushr v0.4s, v5.4s, #16 -; CHECK-NEXT: ushr v17.4s, v4.4s, #16 -; CHECK-NEXT: ushr v1.4s, v6.4s, #16 -; CHECK-NEXT: ushr v2.4s, v7.4s, #16 -; CHECK-NEXT: ushr v23.4s, v20.4s, #16 -; CHECK-NEXT: ushr v25.4s, v22.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v2.4s, v6.4s, #16 +; CHECK-NEXT: ushr v3.4s, v7.4s, #16 +; CHECK-NEXT: ushr v22.4s, v17.4s, #16 +; CHECK-NEXT: ushr v23.4s, v18.4s, #16 +; CHECK-NEXT: ushr v24.4s, v19.4s, #16 +; CHECK-NEXT: ushr v25.4s, v20.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v16.16b -; CHECK-NEXT: and v3.16b, v17.16b, v16.16b ; CHECK-NEXT: and v1.16b, v1.16b, v16.16b ; CHECK-NEXT: and v2.16b, v2.16b, v16.16b -; CHECK-NEXT: ushr v17.4s, v19.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b +; CHECK-NEXT: and v22.16b, v22.16b, v16.16b ; CHECK-NEXT: and v23.16b, v23.16b, v16.16b -; CHECK-NEXT: add v24.4s, v0.4s, v18.4s -; CHECK-NEXT: ushr v0.4s, v21.4s, #16 -; CHECK-NEXT: add v3.4s, v3.4s, v18.4s -; CHECK-NEXT: add v26.4s, v1.4s, v18.4s -; CHECK-NEXT: add v27.4s, v2.4s, v18.4s -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b -; CHECK-NEXT: and v28.16b, v0.16b, v16.16b +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b ; CHECK-NEXT: and v16.16b, v25.16b, v16.16b -; CHECK-NEXT: addhn v0.4h, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v24.4s -; CHECK-NEXT: add v4.4s, v17.4s, v18.4s -; 
CHECK-NEXT: addhn v2.4h, v6.4s, v26.4s -; CHECK-NEXT: add v5.4s, v23.4s, v18.4s -; CHECK-NEXT: addhn v3.4h, v7.4s, v27.4s -; CHECK-NEXT: add v6.4s, v28.4s, v18.4s -; CHECK-NEXT: add v16.4s, v16.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v4.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v5.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v6.4s -; CHECK-NEXT: addhn2 v3.8h, v22.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v1.4s, v1.4s, v21.4s +; CHECK-NEXT: add v2.4s, v2.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: addhn v0.4h, v5.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: addhn v2.4h, v6.4s, v2.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v3.4s +; CHECK-NEXT: add v4.4s, v22.4s, v21.4s +; CHECK-NEXT: add v5.4s, v23.4s, v21.4s +; CHECK-NEXT: add v6.4s, v24.4s, v21.4s +; CHECK-NEXT: add v7.4s, v16.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v17.4s, v4.4s +; CHECK-NEXT: addhn2 v1.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v6.4s +; CHECK-NEXT: addhn2 v3.8h, v20.4s, v7.4s ; CHECK-NEXT: ret entry: %c = uitofp <32 x i16> %a to <32 x bfloat> @@ -7768,9 +7768,9 @@ define <3 x bfloat> @stofp_v3i8_v3bf16(<3 x i8> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <3 x i8> %a to <3 x bfloat> @@ -7789,9 +7789,9 @@ define <3 x bfloat> @utofp_v3i8_v3bf16(<3 x i8> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <3 x i8> %a to <3 x bfloat> @@ -7808,9 +7808,9 @@ define <4 x bfloat> @stofp_v4i8_v4bf16(<4 x i8> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <4 x i8> %a to <4 x bfloat> @@ -7826,9 +7826,9 @@ define <4 x bfloat> @utofp_v4i8_v4bf16(<4 x i8> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <4 x i8> %a to <4 x bfloat> @@ -7909,11 +7909,11 @@ define <16 x bfloat> @stofp_v16i8_v16bf16(<16 x i8> %a) { ; CHECK-NEXT: add v5.4s, v5.4s, v7.4s ; CHECK-NEXT: add v0.4s, v0.4s, v7.4s ; CHECK-NEXT: addhn v1.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: add v5.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn v0.4h, v4.4s, v0.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: add v4.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn2 v1.8h, v2.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v6.4s, v4.4s ; CHECK-NEXT: ret entry: %c = sitofp <16 x i8> %a to <16 x bfloat> @@ -7946,11 +7946,11 @@ define <16 x bfloat> 
@utofp_v16i8_v16bf16(<16 x i8> %a) { ; CHECK-NEXT: add v5.4s, v5.4s, v7.4s ; CHECK-NEXT: add v0.4s, v0.4s, v7.4s ; CHECK-NEXT: addhn v1.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: add v5.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn v0.4h, v4.4s, v0.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: add v4.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn2 v1.8h, v2.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v6.4s, v4.4s ; CHECK-NEXT: ret entry: %c = uitofp <16 x i8> %a to <16 x bfloat> @@ -7961,14 +7961,14 @@ define <32 x bfloat> @stofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-LABEL: stofp_v32i8_v32bf16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0 -; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: movi v20.4s, #127, msl #8 +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: sshll v5.4s, v3.4h, #0 -; CHECK-NEXT: sshll v6.4s, v4.4h, #0 -; CHECK-NEXT: sshll v7.4s, v0.4h, #0 +; CHECK-NEXT: sshll v6.4s, v0.4h, #0 +; CHECK-NEXT: sshll v7.4s, v4.4h, #0 ; CHECK-NEXT: sshll v16.4s, v1.4h, #0 ; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-NEXT: sshll2 v4.4s, v4.8h, #0 @@ -7980,40 +7980,40 @@ define <32 x bfloat> @stofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-NEXT: scvtf v16.4s, v16.4s ; CHECK-NEXT: scvtf v17.4s, v3.4s ; CHECK-NEXT: scvtf v4.4s, v4.4s -; CHECK-NEXT: scvtf v19.4s, v0.4s -; CHECK-NEXT: scvtf v21.4s, v1.4s -; CHECK-NEXT: ushr v3.4s, v5.4s, #16 -; CHECK-NEXT: ushr v18.4s, v6.4s, #16 -; CHECK-NEXT: ushr v0.4s, v7.4s, #16 -; CHECK-NEXT: ushr v1.4s, v16.4s, #16 -; CHECK-NEXT: ushr v22.4s, v17.4s, #16 -; CHECK-NEXT: ushr v23.4s, v4.4s, #16 -; CHECK-NEXT: ushr v24.4s, v19.4s, #16 -; CHECK-NEXT: ushr v25.4s, v21.4s, #16 -; CHECK-NEXT: and v3.16b, v3.16b, v2.16b -; CHECK-NEXT: and v18.16b, v18.16b, v2.16b +; CHECK-NEXT: scvtf v18.4s, v0.4s +; CHECK-NEXT: scvtf v19.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v5.4s, #16 +; CHECK-NEXT: ushr v3.4s, v6.4s, #16 +; CHECK-NEXT: ushr v1.4s, v7.4s, #16 +; CHECK-NEXT: ushr v20.4s, v16.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v4.4s, #16 +; CHECK-NEXT: ushr v22.4s, v18.4s, #16 +; CHECK-NEXT: ushr v25.4s, v19.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v20.16b, v20.16b, v2.16b ; CHECK-NEXT: and v23.16b, v23.16b, v2.16b ; CHECK-NEXT: and v24.16b, v24.16b, v2.16b -; CHECK-NEXT: and v2.16b, v25.16b, v2.16b -; CHECK-NEXT: add v3.4s, v3.4s, v20.4s -; CHECK-NEXT: add v18.4s, v18.4s, v20.4s -; CHECK-NEXT: add v0.4s, v0.4s, v20.4s -; CHECK-NEXT: add v26.4s, v1.4s, v20.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v3.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn v0.4h, v7.4s, v0.4s -; CHECK-NEXT: add v5.4s, v22.4s, v20.4s -; CHECK-NEXT: add v6.4s, v24.4s, v20.4s -; CHECK-NEXT: add v7.4s, v23.4s, v20.4s -; CHECK-NEXT: add v18.4s, v2.4s, v20.4s -; CHECK-NEXT: addhn v2.4h, v16.4s, v26.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v17.4s, v5.4s +; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v25.16b, v25.16b, v2.16b +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: add v26.4s, v1.4s, v21.4s +; CHECK-NEXT: add v20.4s, v20.4s, v21.4s +; CHECK-NEXT: addhn v1.4h, v5.4s, v0.4s +; 
CHECK-NEXT: addhn v0.4h, v6.4s, v3.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v26.4s +; CHECK-NEXT: addhn v2.4h, v16.4s, v20.4s +; CHECK-NEXT: add v5.4s, v22.4s, v21.4s +; CHECK-NEXT: add v6.4s, v23.4s, v21.4s +; CHECK-NEXT: add v7.4s, v24.4s, v21.4s +; CHECK-NEXT: add v16.4s, v25.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v1.8h, v17.4s, v6.4s ; CHECK-NEXT: addhn2 v3.8h, v4.4s, v7.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v18.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v16.4s ; CHECK-NEXT: ret entry: %c = sitofp <32 x i8> %a to <32 x bfloat> @@ -8024,14 +8024,14 @@ define <32 x bfloat> @utofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-LABEL: utofp_v32i8_v32bf16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0 -; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: movi v20.4s, #127, msl #8 +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: ushll v5.4s, v3.4h, #0 -; CHECK-NEXT: ushll v6.4s, v4.4h, #0 -; CHECK-NEXT: ushll v7.4s, v0.4h, #0 +; CHECK-NEXT: ushll v6.4s, v0.4h, #0 +; CHECK-NEXT: ushll v7.4s, v4.4h, #0 ; CHECK-NEXT: ushll v16.4s, v1.4h, #0 ; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0 ; CHECK-NEXT: ushll2 v4.4s, v4.8h, #0 @@ -8043,40 +8043,40 @@ define <32 x bfloat> @utofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-NEXT: ucvtf v16.4s, v16.4s ; CHECK-NEXT: ucvtf v17.4s, v3.4s ; CHECK-NEXT: ucvtf v4.4s, v4.4s -; CHECK-NEXT: ucvtf v19.4s, v0.4s -; CHECK-NEXT: ucvtf v21.4s, v1.4s -; CHECK-NEXT: ushr v3.4s, v5.4s, #16 -; CHECK-NEXT: ushr v18.4s, v6.4s, #16 -; CHECK-NEXT: ushr v0.4s, v7.4s, #16 -; CHECK-NEXT: ushr v1.4s, v16.4s, #16 -; CHECK-NEXT: ushr v22.4s, v17.4s, #16 -; CHECK-NEXT: ushr v23.4s, v4.4s, #16 -; CHECK-NEXT: ushr v24.4s, v19.4s, #16 -; CHECK-NEXT: ushr v25.4s, v21.4s, #16 -; CHECK-NEXT: and v3.16b, v3.16b, v2.16b -; CHECK-NEXT: and v18.16b, v18.16b, v2.16b +; CHECK-NEXT: ucvtf v18.4s, v0.4s +; CHECK-NEXT: ucvtf v19.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v5.4s, #16 +; CHECK-NEXT: ushr v3.4s, v6.4s, #16 +; CHECK-NEXT: ushr v1.4s, v7.4s, #16 +; CHECK-NEXT: ushr v20.4s, v16.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v4.4s, #16 +; CHECK-NEXT: ushr v22.4s, v18.4s, #16 +; CHECK-NEXT: ushr v25.4s, v19.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v20.16b, v20.16b, v2.16b ; CHECK-NEXT: and v23.16b, v23.16b, v2.16b ; CHECK-NEXT: and v24.16b, v24.16b, v2.16b -; CHECK-NEXT: and v2.16b, v25.16b, v2.16b -; CHECK-NEXT: add v3.4s, v3.4s, v20.4s -; CHECK-NEXT: add v18.4s, v18.4s, v20.4s -; CHECK-NEXT: add v0.4s, v0.4s, v20.4s -; CHECK-NEXT: add v26.4s, v1.4s, v20.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v3.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn v0.4h, v7.4s, v0.4s -; CHECK-NEXT: add v5.4s, v22.4s, v20.4s -; CHECK-NEXT: add v6.4s, v24.4s, v20.4s -; CHECK-NEXT: add v7.4s, v23.4s, v20.4s -; CHECK-NEXT: add v18.4s, v2.4s, v20.4s -; CHECK-NEXT: addhn v2.4h, v16.4s, v26.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v17.4s, v5.4s +; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v25.16b, v25.16b, v2.16b +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: add v26.4s, v1.4s, v21.4s +; CHECK-NEXT: add v20.4s, v20.4s, v21.4s +; CHECK-NEXT: addhn 
v1.4h, v5.4s, v0.4s +; CHECK-NEXT: addhn v0.4h, v6.4s, v3.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v26.4s +; CHECK-NEXT: addhn v2.4h, v16.4s, v20.4s +; CHECK-NEXT: add v5.4s, v22.4s, v21.4s +; CHECK-NEXT: add v6.4s, v23.4s, v21.4s +; CHECK-NEXT: add v7.4s, v24.4s, v21.4s +; CHECK-NEXT: add v16.4s, v25.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v1.8h, v17.4s, v6.4s ; CHECK-NEXT: addhn2 v3.8h, v4.4s, v7.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v18.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v16.4s ; CHECK-NEXT: ret entry: %c = uitofp <32 x i8> %a to <32 x bfloat> diff --git a/llvm/test/CodeGen/AArch64/ldexp.ll b/llvm/test/CodeGen/AArch64/ldexp.ll index 4b491051a88aa7..ba04ba1d7bb6ac 100644 --- a/llvm/test/CodeGen/AArch64/ldexp.ll +++ b/llvm/test/CodeGen/AArch64/ldexp.ll @@ -4,9 +4,9 @@ define double @testExp(double %val, i32 %a) { ; CHECK-LABEL: testExp: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: fmov d1, x8 ; CHECK-NEXT: fscale z0.d, p0/m, z0.d, z1.d @@ -22,8 +22,8 @@ declare double @ldexp(double, i32) memory(none) define float @testExpf(float %val, i32 %a) { ; CHECK-LABEL: testExpf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: fscale z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -49,9 +49,9 @@ declare fp128 @ldexpl(fp128, i32) memory(none) define half @testExpf16(half %val, i32 %a) { ; CHECK-LABEL: testExpf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fscale z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll index ab15bf564ec425..59a460923e8b71 100644 --- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll +++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll @@ -10,9 +10,9 @@ define <vscale x 16 x i8> @sdiv_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) ; CHECK: // %bb.0: ; CHECK-NEXT: sunpkhi z2.h, z1.b ; CHECK-NEXT: sunpkhi z3.h, z0.b -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z4.s, z2.h ; CHECK-NEXT: sunpkhi z5.s, z3.h ; CHECK-NEXT: sunpklo z2.s, z2.h @@ -36,11 +36,11 @@ define <vscale x 16 x i8> @sdiv_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) define <vscale x 8 x i16> @sdiv_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { ; CHECK-LABEL: sdiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z2.s, z1.h ; CHECK-NEXT: sunpkhi z3.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h @@ -140,9 +140,9 @@ define <vscale x 16 x i8> @srem_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) define <vscale x 8 x i16> @srem_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { ; CHECK-LABEL: srem_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z2.s, z1.h ; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z4.s, z0.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: sunpklo z3.s, z1.h @@ -188,9 +188,9 @@ define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) ; CHECK: // %bb.0: ; CHECK-NEXT: uunpkhi z2.h, z1.b ; CHECK-NEXT: uunpkhi z3.h, z0.b -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: ptrue p0.s ; 
CHECK-NEXT: uunpkhi z4.s, z2.h ; CHECK-NEXT: uunpkhi z5.s, z3.h ; CHECK-NEXT: uunpklo z2.s, z2.h @@ -214,11 +214,11 @@ define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) define <vscale x 8 x i16> @udiv_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { ; CHECK-LABEL: udiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z2.s, z1.h ; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h @@ -261,9 +261,9 @@ define <vscale x 8 x i32> @udiv_split_i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) define <vscale x 2 x i32> @udiv_widen_i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b) { ; CHECK-LABEL: udiv_widen_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %div = udiv <vscale x 2 x i32> %a, %b @@ -319,9 +319,9 @@ define <vscale x 16 x i8> @urem_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) define <vscale x 8 x i16> @urem_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { ; CHECK-LABEL: urem_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z2.s, z1.h ; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z4.s, z0.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: uunpklo z3.s, z1.h @@ -558,9 +558,9 @@ define <vscale x 4 x i64> @umin_split_i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b) define <vscale x 8 x i8> @umin_promote_i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b) { ; CHECK-LABEL: umin_promote_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %cmp = icmp ult <vscale x 8 x i8> %a, %b @@ -704,9 +704,9 @@ define <vscale x 16 x i16> @umax_split_i16(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) define <vscale x 2 x i32> @umax_promote_i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b) { ; CHECK-LABEL: umax_promote_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %cmp = icmp ugt <vscale x 2 x i32> %a, %b @@ -883,8 +883,8 @@ define <vscale x 4 x i64> @lsl_split_i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b) define <vscale x 4 x i16> @lsl_promote_i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b){ ; CHECK-LABEL: lsl_promote_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %shl = shl <vscale x 4 x i16> %a, %b @@ -982,9 +982,9 @@ define <vscale x 2 x i64> @lsr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) define <vscale x 8 x i8> @lsr_promote_i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b){ ; CHECK-LABEL: lsr_promote_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %shr = lshr <vscale x 8 x i8> %a, %b @@ -1081,10 +1081,10 @@ declare <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>) define <vscale x 2 x i64> @fshl_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c){ ; CHECK-LABEL: fshl_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z3.d, #63 // =0x3f ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: lsr z1.d, z1.d, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: bic z2.d, z3.d, z2.d ; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z4.d @@ -1098,17 +1098,16 @@ define <vscale x 2 x i64> @fshl_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) define <vscale x 4 x i64> @fshl_illegal_i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, <vscale x 4 x i64> %c){ ; CHECK-LABEL: fshl_illegal_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z6.d, #63 // =0x3f -; CHECK-NEXT: mov z7.d, z4.d ; CHECK-NEXT: lsr z2.d, z2.d, #1 ; CHECK-NEXT: lsr z3.d, z3.d, #1 -; CHECK-NEXT: bic z4.d, z6.d, z4.d -; CHECK-NEXT: and z7.d, z7.d, #0x3f +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: bic z7.d, z6.d, z4.d +; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: bic z6.d, z6.d, z5.d ; CHECK-NEXT: and z5.d, z5.d, #0x3f -; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z7.d -; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z4.d +; CHECK-NEXT: lsl z0.d, p0/m, 
z0.d, z4.d +; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z7.d ; CHECK-NEXT: lsr z3.d, p0/m, z3.d, z6.d ; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z5.d ; CHECK-NEXT: orr z0.d, z0.d, z2.d @@ -1121,9 +1120,9 @@ define <vscale x 4 x i64> @fshl_illegal_i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b, <vscale x 4 x i64> %c) define <vscale x 2 x i64> @fshl_rot_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b){ ; CHECK-LABEL: fshl_rot_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: subr z1.d, z1.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z2.d, z2.d, #0x3f ; CHECK-NEXT: and z1.d, z1.d, #0x3f ; CHECK-NEXT: lslr z2.d, p0/m, z2.d, z0.d @@ -1138,11 +1137,11 @@ define <vscale x 2 x i64> @fshl_rot_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) define <vscale x 4 x i64> @fshl_rot_illegal_i64(<vscale x 4 x i64> %a, <vscale x 4 x i64> %b){ ; CHECK-LABEL: fshl_rot_illegal_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: subr z2.d, z2.d, #0 // =0x0 ; CHECK-NEXT: mov z5.d, z3.d ; CHECK-NEXT: subr z3.d, z3.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: and z2.d, z2.d, #0x3f ; CHECK-NEXT: and z5.d, z5.d, #0x3f @@ -1175,10 +1174,10 @@ define <vscale x 2 x i64> @fshl_rot_const_i64(<vscale x 2 x i64> %a){ define <vscale x 2 x i64> @fshr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c){ ; CHECK-LABEL: fshr_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z3.d, #63 // =0x3f ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: lsl z0.d, z0.d, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: bic z2.d, z3.d, z2.d ; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z4.d @@ -1192,9 +1191,9 @@ define <vscale x 2 x i64> @fshr_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) define <vscale x 2 x i64> @fshr_rot_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b){ ; CHECK-LABEL: fshr_rot_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: subr z1.d, z1.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z2.d, z2.d, #0x3f ; CHECK-NEXT: and z1.d, z1.d, #0x3f ; CHECK-NEXT: lsrr z2.d, p0/m, z2.d, z0.d diff --git a/llvm/test/CodeGen/AArch64/load-insert-zero.ll b/llvm/test/CodeGen/AArch64/load-insert-zero.ll index 993af08a66ddd9..23d545459295fc 100644 --- a/llvm/test/CodeGen/AArch64/load-insert-zero.ll +++ b/llvm/test/CodeGen/AArch64/load-insert-zero.ll @@ -469,18 +469,18 @@ define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef ; CHECK-NEXT: lsr w8, w8, #24 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b ; CHECK-NEXT: urhadd v1.8b, v1.8b, v2.8b -; CHECK-NEXT: str s1, [x0] ; CHECK-NEXT: add v0.8h, v0.8h, v3.8h ; CHECK-NEXT: dup v3.8b, w8 +; CHECK-NEXT: str s1, [x0] ; CHECK-NEXT: lsl x8, x1, #1 ; CHECK-NEXT: rshrn v0.8b, v0.8h, #2 ; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s ; CHECK-NEXT: str s0, [x0, x1] ; CHECK-NEXT: zip1 v3.2s, v0.2s, v3.2s ; CHECK-NEXT: ext v2.8b, v2.8b, v0.8b, #1 +; CHECK-NEXT: ext v1.8b, v3.8b, v0.8b, #1 ; CHECK-NEXT: str s2, [x0, x8] ; CHECK-NEXT: add x8, x8, x1 -; CHECK-NEXT: ext v1.8b, v3.8b, v0.8b, #1 ; CHECK-NEXT: str s1, [x0, x8] ; CHECK-NEXT: ret %5 = load i32, ptr %2, align 4 @@ -608,9 +608,9 @@ define void @predictor_4x4_neon_new(ptr nocapture noundef writeonly %0, i64 noun define <vscale x 8 x i8> @loadnxv8i8(ptr %p) { ; CHECK-LABEL: loadnxv8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: ret %l = load i8, ptr %p @@ -631,9 +631,9 @@ define <vscale x 16 x i8> @loadnxv16i8(ptr %p) { define <vscale x 4 x i16> @loadnxv4i16(ptr %p) { ; CHECK-LABEL: loadnxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: ret %l = load i16, ptr %p @@ -654,9 +654,9 @@ define <vscale x 8 x i16> @loadnxv8i16(ptr %p) { define <vscale x 2 x i32> @loadnxv2i32(ptr %p) { ; CHECK-LABEL: 
loadnxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %l = load i32, ptr %p @@ -688,9 +688,9 @@ define <vscale x 2 x i64> @loadnxv2i64(ptr %p) { define <vscale x 4 x half> @loadnxv4f16(ptr %p) { ; CHECK-LABEL: loadnxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -715,9 +715,9 @@ define <vscale x 8 x half> @loadnxv8f16(ptr %p) { define <vscale x 4 x bfloat> @loadnxv4bf16(ptr %p) { ; CHECK-LABEL: loadnxv4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -742,9 +742,9 @@ define <vscale x 8 x bfloat> @loadnxv8bf16(ptr %p) { define <vscale x 2 x float> @loadnxv2f32(ptr %p) { ; CHECK-LABEL: loadnxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d ; CHECK-NEXT: mov z0.s, #0 // =0x0 @@ -782,9 +782,9 @@ define <vscale x 2 x double> @loadnxv2f64(ptr %p) { define <vscale x 8 x i8> @loadnxv8i8_offset(ptr %p) { ; CHECK-LABEL: loadnxv8i8_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: ret %g = getelementptr inbounds i8, ptr %p, i64 1 @@ -807,9 +807,9 @@ define <vscale x 16 x i8> @loadnxv16i8_offset(ptr %p) { define <vscale x 4 x i16> @loadnxv4i16_offset(ptr %p) { ; CHECK-LABEL: loadnxv4i16_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: ret %g = getelementptr inbounds i8, ptr %p, i64 1 @@ -832,9 +832,9 @@ define <vscale x 8 x i16> @loadnxv8i16_offset(ptr %p) { define <vscale x 2 x i32> @loadnxv2i32_offset(ptr %p) { ; CHECK-LABEL: loadnxv2i32_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %g = getelementptr inbounds i8, ptr %p, i64 1 @@ -869,9 +869,9 @@ define <vscale x 2 x i64> @loadnxv2i64_offset(ptr %p) { define <vscale x 4 x half> @loadnxv4f16_offset(ptr %p) { ; CHECK-LABEL: loadnxv4f16_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -898,9 +898,9 @@ define <vscale x 8 x half> @loadnxv8f16_offset(ptr %p) { define <vscale x 4 x bfloat> @loadnxv4bf16_offset(ptr %p) { ; CHECK-LABEL: loadnxv4bf16_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -927,9 +927,9 @@ define <vscale x 8 x bfloat> @loadnxv8bf16_offset(ptr %p) { define <vscale x 2 x float> @loadnxv2f32_offset(ptr %p) { ; CHECK-LABEL: loadnxv2f32_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d ; CHECK-NEXT: mov z0.s, #0 // =0x0 diff --git a/llvm/test/CodeGen/AArch64/logic-shift.ll 
b/llvm/test/CodeGen/AArch64/logic-shift.ll index 39f82dd4593fb6..31047954401cf5 100644 --- a/llvm/test/CodeGen/AArch64/logic-shift.ll +++ b/llvm/test/CodeGen/AArch64/logic-shift.ll @@ -34,9 +34,9 @@ define i32 @or_lshr_commute1(i32 %x0, i32 %x1, i32 %y, i32 %z) { define <8 x i16> @or_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, <8 x i16> %z) { ; CHECK-LABEL: or_lshr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.8h, v2.8h ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h +; CHECK-NEXT: neg v1.8h, v2.8h +; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <8 x i16> %x0, %y @@ -49,9 +49,9 @@ define <8 x i16> @or_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, < define <2 x i64> @or_lshr_commute3(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %y, <2 x i64> %z) { ; CHECK-LABEL: or_lshr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.2d, v2.2d ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.2d, v0.2d, v2.2d +; CHECK-NEXT: neg v1.2d, v2.2d +; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <2 x i64> %x0, %y @@ -94,9 +94,9 @@ define i64 @or_ashr_commute1(i64 %x0, i64 %x1, i64 %y, i64 %z) { define <4 x i32> @or_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: or_ashr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: neg v1.4s, v2.4s +; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <4 x i32> %x0, %y @@ -109,9 +109,9 @@ define <4 x i32> @or_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, < define <16 x i8> @or_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y, <16 x i8> %z) { ; CHECK-LABEL: or_ashr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.16b, v2.16b ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.16b, v0.16b, v2.16b +; CHECK-NEXT: neg v1.16b, v2.16b +; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <16 x i8> %x0, %y @@ -262,9 +262,9 @@ define i32 @xor_lshr_commute1(i32 %x0, i32 %x1, i32 %y, i32 %z) { define <8 x i16> @xor_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, <8 x i16> %z) { ; CHECK-LABEL: xor_lshr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.8h, v2.8h ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h +; CHECK-NEXT: neg v1.8h, v2.8h +; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <8 x i16> %x0, %y @@ -277,9 +277,9 @@ define <8 x i16> @xor_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, define <2 x i64> @xor_lshr_commute3(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %y, <2 x i64> %z) { ; CHECK-LABEL: xor_lshr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.2d, v2.2d ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.2d, v0.2d, v2.2d +; CHECK-NEXT: neg v1.2d, v2.2d +; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <2 x i64> %x0, %y @@ -322,9 +322,9 @@ define i64 @xor_ashr_commute1(i64 %x0, i64 %x1, i64 %y, i64 %z) { define <4 x i32> @xor_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: xor_ashr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: 
eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: neg v1.4s, v2.4s +; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <4 x i32> %x0, %y @@ -337,9 +337,9 @@ define <4 x i32> @xor_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, define <16 x i8> @xor_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y, <16 x i8> %z) { ; CHECK-LABEL: xor_ashr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.16b, v2.16b ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.16b, v0.16b, v2.16b +; CHECK-NEXT: neg v1.16b, v2.16b +; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <16 x i8> %x0, %y @@ -490,9 +490,9 @@ define i32 @and_lshr_commute1(i32 %x0, i32 %x1, i32 %y, i32 %z) { define <8 x i16> @and_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, <8 x i16> %z) { ; CHECK-LABEL: and_lshr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.8h, v2.8h ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h +; CHECK-NEXT: neg v1.8h, v2.8h +; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <8 x i16> %x0, %y @@ -505,9 +505,9 @@ define <8 x i16> @and_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, define <2 x i64> @and_lshr_commute3(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %y, <2 x i64> %z) { ; CHECK-LABEL: and_lshr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.2d, v2.2d ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.2d, v0.2d, v2.2d +; CHECK-NEXT: neg v1.2d, v2.2d +; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <2 x i64> %x0, %y @@ -550,9 +550,9 @@ define i64 @and_ashr_commute1(i64 %x0, i64 %x1, i64 %y, i64 %z) { define <4 x i32> @and_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: and_ashr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: neg v1.4s, v2.4s +; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <4 x i32> %x0, %y @@ -565,9 +565,9 @@ define <4 x i32> @and_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, define <16 x i8> @and_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y, <16 x i8> %z) { ; CHECK-LABEL: and_ashr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.16b, v0.16b, v2.16b +; CHECK-NEXT: neg v1.16b, v2.16b +; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <16 x i8> %x0, %y diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll index 06570b4539cc11..fac96e07de541d 100644 --- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll +++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -258,10 +258,10 @@ define @splice_nxv2i1_idx( %a, @llvm.experimental.vector.splice.nxv2i1( %a, %b, i32 1) ret %res @@ -273,10 +273,10 @@ define @splice_nxv4i1_idx( %a, @llvm.experimental.vector.splice.nxv4i1( %a, %b, i32 2) ret %res @@ -288,10 +288,10 @@ define @splice_nxv8i1_idx( %a, @llvm.experimental.vector.splice.nxv8i1( %a, %b, i32 4) ret %res @@ -303,10 +303,10 @@ define @splice_nxv16i1_idx( %a, 
@llvm.experimental.vector.splice.nxv16i1( %a, %b, i32 8) ret %res @@ -350,16 +350,16 @@ define @splice_nxv16f32_16( %a, @splice_nxv16i8_neg17( %a, @splice_nxv8i16_neg9( %a, @splice_nxv8f16_neg9( %a, @splice_nxv2i1( %a, ; CHECK-NEXT: ptrue p2.d, vl1 ; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1 +; CHECK-NEXT: rev p0.d, p2.d +; CHECK-NEXT: splice z1.d, p0, z1.d, z0.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: rev p2.d, p2.d -; CHECK-NEXT: splice z1.d, p2, z1.d, z0.d ; CHECK-NEXT: and z1.d, z1.d, #0x1 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ret @@ -716,9 +716,9 @@ define @splice_nxv4i1( %a, ; CHECK-NEXT: ptrue p2.s, vl1 ; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 +; CHECK-NEXT: rev p0.s, p2.s +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: rev p2.s, p2.s -; CHECK-NEXT: splice z1.s, p2, z1.s, z0.s ; CHECK-NEXT: and z1.s, z1.s, #0x1 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: ret @@ -733,9 +733,9 @@ define @splice_nxv8i1( %a, ; CHECK-NEXT: ptrue p2.h, vl1 ; CHECK-NEXT: mov z0.h, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 +; CHECK-NEXT: rev p0.h, p2.h +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: rev p2.h, p2.h -; CHECK-NEXT: splice z1.h, p2, z1.h, z0.h ; CHECK-NEXT: and z1.h, z1.h, #0x1 ; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0 ; CHECK-NEXT: ret @@ -750,9 +750,9 @@ define @splice_nxv16i1( %a, @splice_nxv8i32( %a, @splice_nxv16f32_neg17( %a, %a, <24 x i8> %b, <24 x i8> %c, <24 ; CHECK-NEXT: ld1 { v5.b }[15], [x8] ; CHECK-NEXT: ld1 { v7.b }[7], [x10] ; CHECK-NEXT: addp v1.2s, v19.2s, v19.2s +; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sdot v16.4s, v5.16b, v4.16b ; CHECK-NEXT: sdot v18.2s, v7.8b, v6.8b -; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: addv s2, v16.4s ; CHECK-NEXT: addp v3.2s, v18.2s, v18.2s @@ -975,8 +975,8 @@ define i32 @test_sdot_v24i8_double_nomla(<24 x i8> %a, <24 x i8> %b, <24 x i8> % ; CHECK-NEXT: addv s3, v5.4s ; CHECK-NEXT: addp v1.2s, v17.2s, v17.2s ; CHECK-NEXT: addp v2.2s, v7.2s, v7.2s -; CHECK-NEXT: addv s0, v6.4s ; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: addv s0, v6.4s ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: fmov w11, s2 ; CHECK-NEXT: fmov w8, s0 @@ -998,26 +998,26 @@ entry: define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q4, q0, [x0] +; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ldp q5, q1, [x1] -; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 -; CHECK-NEXT: ushll v6.8h, v4.8b, #0 +; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0 +; CHECK-NEXT: ushll v6.8h, v2.8b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: ushll v7.8h, v5.8b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h -; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: umull v3.4s, v4.4h, v3.4h +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: umull2 v16.4s, v7.8h, v6.8h ; CHECK-NEXT: umull v6.4s, v7.4h, v6.4h -; CHECK-NEXT: mov v3.s[0], v2.s[0] -; CHECK-NEXT: ushll2 v2.8h, v4.16b, #0 -; CHECK-NEXT: ushll2 v4.8h, v5.16b, #0 -; CHECK-NEXT: umlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: mov v4.s[0], v3.s[0] +; CHECK-NEXT: ushll2 v3.8h, v5.16b, #0 ; CHECK-NEXT: umlal2 v16.4s, v1.8h, v0.8h -; CHECK-NEXT: umlal v3.4s, v4.4h, v2.4h -; CHECK-NEXT: 
umlal2 v16.4s, v4.8h, v2.8h -; CHECK-NEXT: add v0.4s, v6.4s, v3.4s +; CHECK-NEXT: umlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: umlal v4.4s, v3.4h, v2.4h +; CHECK-NEXT: umlal2 v16.4s, v3.8h, v2.8h +; CHECK-NEXT: add v0.4s, v6.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1039,17 +1039,17 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: ushll v4.8h, v2.8b, #0 -; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-NEXT: uaddl2 v5.4s, v4.8h, v3.8h -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: uaddl v3.4s, v4.4h, v3.4h -; CHECK-NEXT: mov v0.s[0], v1.s[0] -; CHECK-NEXT: uaddw2 v1.4s, v5.4s, v2.8h +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: uaddl2 v5.4s, v4.8h, v1.8h +; CHECK-NEXT: uaddl v1.4s, v4.4h, v1.4h +; CHECK-NEXT: mov v0.s[0], v3.s[0] +; CHECK-NEXT: uaddw2 v3.4s, v5.4s, v2.8h +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -1063,26 +1063,26 @@ entry: define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q4, q0, [x0] +; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ldp q5, q1, [x1] -; CHECK-NEXT: sshll2 v2.8h, v0.16b, #0 -; CHECK-NEXT: sshll v6.8h, v4.8b, #0 +; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0 +; CHECK-NEXT: sshll v6.8h, v2.8b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: sshll v7.8h, v5.8b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: smull v2.4s, v3.4h, v2.4h -; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: smull v3.4s, v4.4h, v3.4h +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: smull2 v16.4s, v7.8h, v6.8h ; CHECK-NEXT: smull v6.4s, v7.4h, v6.4h -; CHECK-NEXT: mov v3.s[0], v2.s[0] -; CHECK-NEXT: sshll2 v2.8h, v4.16b, #0 -; CHECK-NEXT: sshll2 v4.8h, v5.16b, #0 -; CHECK-NEXT: smlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: mov v4.s[0], v3.s[0] +; CHECK-NEXT: sshll2 v3.8h, v5.16b, #0 ; CHECK-NEXT: smlal2 v16.4s, v1.8h, v0.8h -; CHECK-NEXT: smlal v3.4s, v4.4h, v2.4h -; CHECK-NEXT: smlal2 v16.4s, v4.8h, v2.8h -; CHECK-NEXT: add v0.4s, v6.4s, v3.4s +; CHECK-NEXT: smlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: smlal v4.4s, v3.4h, v2.4h +; CHECK-NEXT: smlal2 v16.4s, v3.8h, v2.8h +; CHECK-NEXT: add v0.4s, v6.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1105,222 +1105,222 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: fmov s4, w0 -; CHECK-NEXT: ldr b0, [sp, #80] -; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b1, [sp, #16] -; CHECK-NEXT: add x10, sp, #24 -; CHECK-NEXT: ldr b2, [sp, #280] -; CHECK-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-NEXT: ldr b0, [sp, #80] +; CHECK-NEXT: add x11, sp, #24 ; CHECK-NEXT: ldr b3, [sp, #216] +; CHECK-NEXT: add x10, sp, #88 +; CHECK-NEXT: ldr b2, [sp, #280] +; CHECK-NEXT: ld1 { v1.b }[1], [x11] ; CHECK-NEXT: add x11, sp, #224 -; CHECK-NEXT: mov v4.b[1], w1 -; CHECK-NEXT: ld1 { v1.b }[1], [x10] +; CHECK-NEXT: ldr b4, [sp, #152] +; CHECK-NEXT: ldr b6, [sp, #480] +; CHECK-NEXT: ld1 { v0.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #288 -; CHECK-NEXT: ldr b5, [sp, #152] -; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: add x12, sp, #160 ; CHECK-NEXT: ld1 { v3.b }[1], [x11] -; CHECK-NEXT: add x10, sp, #160 -; CHECK-NEXT: ld1 { v0.b }[2], [x9] -; CHECK-NEXT: ld1 { v5.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #32 -; CHECK-NEXT: add x11, sp, #296 -; CHECK-NEXT: mov v4.b[2], w2 -; CHECK-NEXT: ld1 { v1.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #232 +; CHECK-NEXT: add x11, sp, #488 +; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: ld1 { v4.b }[1], [x12] +; CHECK-NEXT: ld1 { v6.b }[1], [x11] +; CHECK-NEXT: add x11, sp, #32 +; CHECK-NEXT: add x9, sp, #96 ; CHECK-NEXT: add x8, sp, #104 -; CHECK-NEXT: ld1 { v2.b }[2], [x11] -; CHECK-NEXT: ld1 { v3.b }[2], [x10] +; CHECK-NEXT: ld1 { v1.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #232 +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #296 +; CHECK-NEXT: ld1 { v3.b }[2], [x11] ; CHECK-NEXT: add x11, sp, #168 +; CHECK-NEXT: ld1 { v2.b }[2], [x9] +; CHECK-NEXT: ld1 { v4.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #40 +; CHECK-NEXT: ld1 { v1.b }[3], [x11] ; CHECK-NEXT: ld1 { v0.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #40 -; CHECK-NEXT: ld1 { v5.b }[2], [x11] -; CHECK-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #240 -; CHECK-NEXT: mov v4.b[3], w3 -; CHECK-NEXT: ld1 { v3.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: add x12, sp, #112 -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: ld1 { v5.b }[3], [x8] -; CHECK-NEXT: ld1 { v0.b }[4], [x12] -; CHECK-NEXT: add x12, sp, #184 -; CHECK-NEXT: ld1 { v1.b }[4], [x13] -; CHECK-NEXT: add x15, sp, #56 -; CHECK-NEXT: add x14, sp, #128 -; CHECK-NEXT: mov v4.b[4], w4 -; CHECK-NEXT: add x11, sp, #304 -; CHECK-NEXT: add x13, sp, #256 -; CHECK-NEXT: ld1 { v5.b }[4], [x12] -; CHECK-NEXT: ld1 { v0.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #192 -; CHECK-NEXT: add x12, sp, #248 -; CHECK-NEXT: ld1 { v1.b }[5], [x15] -; CHECK-NEXT: add x15, sp, #200 -; CHECK-NEXT: ld1 { v3.b }[4], [x12] -; CHECK-NEXT: ld1 { v2.b }[3], [x11] +; CHECK-NEXT: add x8, sp, #304 +; CHECK-NEXT: add x10, sp, #112 +; CHECK-NEXT: add x11, sp, #240 +; CHECK-NEXT: add x13, sp, #56 +; CHECK-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: ld1 { v3.b }[3], [x11] +; CHECK-NEXT: ld1 { v1.b }[4], [x8] +; CHECK-NEXT: ld1 { v0.b }[4], [x10] +; CHECK-NEXT: add x15, sp, #312 +; CHECK-NEXT: add x12, sp, #120 +; CHECK-NEXT: add x8, sp, #248 ; CHECK-NEXT: add x11, sp, #64 -; CHECK-NEXT: mov v4.b[5], w5 -; CHECK-NEXT: ld1 { v5.b }[5], [x9] -; CHECK-NEXT: ld1 { v0.b }[6], [x14] -; CHECK-NEXT: ldr b6, [sp, #352] -; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v1.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #360 -; CHECK-NEXT: ld1 { 
v3.b }[5], [x13] +; CHECK-NEXT: ld1 { v2.b }[4], [x15] +; CHECK-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-NEXT: add x15, sp, #320 +; CHECK-NEXT: ld1 { v1.b }[5], [x13] +; CHECK-NEXT: ld1 { v0.b }[5], [x12] ; CHECK-NEXT: ldr b18, [sp, #552] -; CHECK-NEXT: ld1 { v5.b }[6], [x15] -; CHECK-NEXT: add x14, sp, #208 -; CHECK-NEXT: ld1 { v6.b }[1], [x11] -; CHECK-NEXT: mov v4.b[6], w6 -; CHECK-NEXT: ld1 { v0.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #560 -; CHECK-NEXT: add x9, sp, #264 -; CHECK-NEXT: ld1 { v18.b }[1], [x10] +; CHECK-NEXT: add x14, sp, #128 +; CHECK-NEXT: add x16, sp, #256 +; CHECK-NEXT: ldr b16, [sp, #352] +; CHECK-NEXT: ld1 { v2.b }[5], [x15] +; CHECK-NEXT: add x15, sp, #176 +; CHECK-NEXT: ld1 { v3.b }[5], [x16] +; CHECK-NEXT: ld1 { v1.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #560 +; CHECK-NEXT: ld1 { v0.b }[6], [x14] +; CHECK-NEXT: add x16, sp, #360 +; CHECK-NEXT: ld1 { v4.b }[3], [x15] +; CHECK-NEXT: ld1 { v18.b }[1], [x11] +; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: ld1 { v16.b }[1], [x16] +; CHECK-NEXT: add x9, sp, #136 +; CHECK-NEXT: add x14, sp, #184 +; CHECK-NEXT: ld1 { v1.b }[7], [x10] ; CHECK-NEXT: add x10, sp, #568 -; CHECK-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-NEXT: ld1 { v3.b }[6], [x9] +; CHECK-NEXT: ld1 { v0.b }[7], [x9] +; CHECK-NEXT: ld1 { v4.b }[4], [x14] ; CHECK-NEXT: add x9, sp, #368 -; CHECK-NEXT: ld1 { v6.b }[2], [x9] -; CHECK-NEXT: add x11, sp, #488 -; CHECK-NEXT: ldr b7, [sp, #144] -; CHECK-NEXT: mov v4.b[7], w7 ; CHECK-NEXT: ld1 { v18.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #376 -; CHECK-NEXT: sshll v17.8h, v5.8b, #0 -; CHECK-NEXT: ldr b5, [sp, #480] -; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v6.b }[3], [x10] +; CHECK-NEXT: add x11, sp, #496 +; CHECK-NEXT: ld1 { v16.b }[2], [x9] +; CHECK-NEXT: fmov s5, w0 +; CHECK-NEXT: add x9, sp, #192 +; CHECK-NEXT: ld1 { v6.b }[2], [x11] ; CHECK-NEXT: add x10, sp, #576 -; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ld1 { v5.b }[1], [x11] +; CHECK-NEXT: ld1 { v4.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #376 ; CHECK-NEXT: ld1 { v18.b }[3], [x10] -; CHECK-NEXT: add x11, sp, #496 -; CHECK-NEXT: sshll v16.8h, v4.8b, #0 -; CHECK-NEXT: ldr b4, [sp, #344] -; CHECK-NEXT: add x10, sp, #384 -; CHECK-NEXT: ld1 { v6.b }[4], [x10] +; CHECK-NEXT: add x11, sp, #504 +; CHECK-NEXT: ld1 { v16.b }[3], [x9] +; CHECK-NEXT: mov v5.b[1], w1 +; CHECK-NEXT: ldr b7, [sp, #144] +; CHECK-NEXT: ldr b17, [sp, #344] +; CHECK-NEXT: add x9, sp, #200 +; CHECK-NEXT: ld1 { v6.b }[3], [x11] ; CHECK-NEXT: add x10, sp, #584 -; CHECK-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-NEXT: sshll v19.8h, v4.8b, #0 -; CHECK-NEXT: ld1 { v5.b }[2], [x11] +; CHECK-NEXT: ld1 { v4.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #384 ; CHECK-NEXT: ld1 { v18.b }[4], [x10] -; CHECK-NEXT: smull2 v4.4s, v16.8h, v17.8h -; CHECK-NEXT: smull v16.4s, v16.4h, v17.4h -; CHECK-NEXT: ldr b17, [sp, #416] -; CHECK-NEXT: add x11, sp, #504 -; CHECK-NEXT: add x10, sp, #424 -; CHECK-NEXT: add x16, sp, #320 -; CHECK-NEXT: smull v19.4s, v7.4h, v19.4h -; CHECK-NEXT: movi v7.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v5.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #392 -; CHECK-NEXT: ld1 { v17.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #592 -; CHECK-NEXT: ld1 { v2.b }[5], [x16] -; CHECK-NEXT: ld1 { v6.b }[5], [x11] -; CHECK-NEXT: ld1 { v18.b }[5], [x10] +; CHECK-NEXT: sshll v7.8h, v7.8b, #0 +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 ; CHECK-NEXT: add x11, sp, #512 -; CHECK-NEXT: add x10, sp, #432 +; CHECK-NEXT: ld1 { v16.b }[4], [x9] +; CHECK-NEXT: ld1 { v6.b }[4], [x11] +; CHECK-NEXT: add x11, sp, 
#592 +; CHECK-NEXT: mov v5.b[2], w2 +; CHECK-NEXT: add x10, sp, #392 +; CHECK-NEXT: ldr b19, [sp, #680] +; CHECK-NEXT: ld1 { v18.b }[5], [x11] +; CHECK-NEXT: smull v7.4s, v7.4h, v17.4h +; CHECK-NEXT: ldr b17, [sp, #416] +; CHECK-NEXT: ld1 { v16.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #688 ; CHECK-NEXT: add x12, sp, #328 -; CHECK-NEXT: mov v7.s[0], v19.s[0] -; CHECK-NEXT: ld1 { v5.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #400 -; CHECK-NEXT: ld1 { v17.b }[2], [x10] +; CHECK-NEXT: add x9, sp, #424 +; CHECK-NEXT: ld1 { v19.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #600 -; CHECK-NEXT: ldr b19, [sp, #680] ; CHECK-NEXT: ldr b20, [sp, #616] ; CHECK-NEXT: ld1 { v2.b }[6], [x12] -; CHECK-NEXT: ld1 { v6.b }[6], [x11] +; CHECK-NEXT: ld1 { v17.b }[1], [x9] +; CHECK-NEXT: add x11, sp, #400 ; CHECK-NEXT: ld1 { v18.b }[6], [x10] -; CHECK-NEXT: add x11, sp, #688 ; CHECK-NEXT: add x12, sp, #624 -; CHECK-NEXT: ld1 { v19.b }[1], [x11] +; CHECK-NEXT: mov v5.b[3], w3 +; CHECK-NEXT: ld1 { v16.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #696 ; CHECK-NEXT: ld1 { v20.b }[1], [x12] -; CHECK-NEXT: add x10, sp, #408 +; CHECK-NEXT: add x9, sp, #432 +; CHECK-NEXT: ld1 { v19.b }[2], [x11] ; CHECK-NEXT: add x11, sp, #608 -; CHECK-NEXT: add x12, sp, #440 -; CHECK-NEXT: ld1 { v6.b }[7], [x10] +; CHECK-NEXT: ld1 { v17.b }[2], [x9] +; CHECK-NEXT: add x10, sp, #408 ; CHECK-NEXT: ld1 { v18.b }[7], [x11] -; CHECK-NEXT: ld1 { v17.b }[3], [x12] -; CHECK-NEXT: add x10, sp, #696 ; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v19.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #448 +; CHECK-NEXT: ld1 { v16.b }[7], [x10] ; CHECK-NEXT: ld1 { v20.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #640 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[4], [x10] +; CHECK-NEXT: mov v5.b[4], w4 ; CHECK-NEXT: add x10, sp, #704 -; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: add x12, sp, #440 ; CHECK-NEXT: ld1 { v19.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: add x12, sp, #520 -; CHECK-NEXT: ld1 { v20.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #648 -; CHECK-NEXT: ldr b21, [sp, #544] -; CHECK-NEXT: smull2 v22.4s, v6.8h, v18.8h -; CHECK-NEXT: smull v6.4s, v6.4h, v18.4h -; CHECK-NEXT: ldr b18, [sp, #744] -; CHECK-NEXT: ld1 { v19.b }[4], [x10] -; CHECK-NEXT: ld1 { v5.b }[5], [x12] -; CHECK-NEXT: add x12, sp, #656 -; CHECK-NEXT: ld1 { v20.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #456 -; CHECK-NEXT: sshll v21.8h, v21.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #720 +; CHECK-NEXT: add x10, sp, #448 +; CHECK-NEXT: ld1 { v17.b }[3], [x12] +; CHECK-NEXT: add x12, sp, #640 +; CHECK-NEXT: sshll v21.8h, v16.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[3], [x12] ; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: add x11, sp, #712 +; CHECK-NEXT: mov v5.b[5], w5 +; CHECK-NEXT: ld1 { v19.b }[4], [x11] +; CHECK-NEXT: add x9, sp, #520 +; CHECK-NEXT: ld1 { v17.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #648 +; CHECK-NEXT: ldr b22, [sp, #544] +; CHECK-NEXT: ld1 { v20.b }[4], [x10] +; CHECK-NEXT: smull2 v16.4s, v21.8h, v18.8h +; CHECK-NEXT: smull v18.4s, v21.4h, v18.4h +; CHECK-NEXT: ldr b21, [sp, #744] +; CHECK-NEXT: add x11, sp, #720 +; CHECK-NEXT: ld1 { v6.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #456 ; CHECK-NEXT: ld1 { v19.b }[5], [x11] +; CHECK-NEXT: mov v5.b[6], w6 +; CHECK-NEXT: ld1 { v17.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #656 +; CHECK-NEXT: sshll v22.8h, v22.8b, #0 +; CHECK-NEXT: sshll v21.8h, v21.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[5], [x9] ; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: add 
x11, sp, #464 -; CHECK-NEXT: ld1 { v20.b }[5], [x12] -; CHECK-NEXT: ld1 { v5.b }[6], [x10] -; CHECK-NEXT: add x12, sp, #728 -; CHECK-NEXT: add x13, sp, #664 -; CHECK-NEXT: add x8, sp, #72 -; CHECK-NEXT: ld1 { v17.b }[6], [x11] -; CHECK-NEXT: ld1 { v19.b }[6], [x12] -; CHECK-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-NEXT: add x11, sp, #728 +; CHECK-NEXT: ld1 { v6.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #464 +; CHECK-NEXT: ld1 { v19.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #664 +; CHECK-NEXT: ld1 { v17.b }[6], [x10] +; CHECK-NEXT: smull v21.4s, v22.4h, v21.4h +; CHECK-NEXT: movi v22.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v20.b }[6], [x11] +; CHECK-NEXT: mov v5.b[7], w7 +; CHECK-NEXT: add x9, sp, #536 +; CHECK-NEXT: add x10, sp, #736 +; CHECK-NEXT: add x11, sp, #208 +; CHECK-NEXT: add x13, sp, #264 +; CHECK-NEXT: ld1 { v6.b }[7], [x9] +; CHECK-NEXT: ld1 { v19.b }[7], [x10] +; CHECK-NEXT: ld1 { v4.b }[7], [x11] +; CHECK-NEXT: add x9, sp, #472 +; CHECK-NEXT: add x10, sp, #672 +; CHECK-NEXT: ld1 { v3.b }[6], [x13] +; CHECK-NEXT: ld1 { v17.b }[7], [x9] +; CHECK-NEXT: ld1 { v20.b }[7], [x10] ; CHECK-NEXT: add x8, sp, #336 -; CHECK-NEXT: ld1 { v20.b }[6], [x13] -; CHECK-NEXT: add x9, sp, #272 -; CHECK-NEXT: smull v18.4s, v21.4h, v18.4h +; CHECK-NEXT: mov v22.s[0], v21.s[0] ; CHECK-NEXT: movi v21.2d, #0000000000000000 -; CHECK-NEXT: add x10, sp, #536 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: sshll v19.8h, v19.8b, #0 ; CHECK-NEXT: ld1 { v2.b }[7], [x8] -; CHECK-NEXT: ld1 { v3.b }[7], [x9] -; CHECK-NEXT: ld1 { v5.b }[7], [x10] -; CHECK-NEXT: add x8, sp, #472 -; CHECK-NEXT: add x9, sp, #736 -; CHECK-NEXT: add x10, sp, #672 -; CHECK-NEXT: ld1 { v17.b }[7], [x8] -; CHECK-NEXT: ld1 { v19.b }[7], [x9] -; CHECK-NEXT: ld1 { v20.b }[7], [x10] +; CHECK-NEXT: add x8, sp, #272 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 +; CHECK-NEXT: sshll v20.8h, v20.8b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: mov v21.s[0], v18.s[0] ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: smlal v18.4s, v6.4h, v19.4h +; CHECK-NEXT: smlal2 v16.4s, v6.8h, v19.8h +; CHECK-NEXT: mov v21.s[0], v7.s[0] +; CHECK-NEXT: smull v6.4s, v5.4h, v4.4h ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: sshll v18.8h, v19.8b, #0 -; CHECK-NEXT: sshll v19.8h, v20.8b, #0 -; CHECK-NEXT: smlal v16.4s, v0.4h, v2.4h +; CHECK-NEXT: smlal v22.4s, v17.4h, v20.4h +; CHECK-NEXT: smull2 v4.4s, v5.8h, v4.8h +; CHECK-NEXT: smlal v21.4s, v1.4h, v3.4h +; CHECK-NEXT: smlal2 v16.4s, v17.8h, v20.8h +; CHECK-NEXT: smlal v6.4s, v0.4h, v2.4h +; CHECK-NEXT: add v5.4s, v18.4s, v22.4s ; CHECK-NEXT: smlal2 v4.4s, v0.8h, v2.8h -; CHECK-NEXT: smlal v7.4s, v1.4h, v3.4h -; CHECK-NEXT: smlal v6.4s, v5.4h, v18.4h -; CHECK-NEXT: smlal2 v22.4s, v5.8h, v18.8h -; CHECK-NEXT: smlal v21.4s, v17.4h, v19.4h +; CHECK-NEXT: add v0.4s, v6.4s, v21.4s +; CHECK-NEXT: add v2.4s, v5.4s, v16.4s ; CHECK-NEXT: smlal2 v4.4s, v1.8h, v3.8h -; CHECK-NEXT: add v0.4s, v16.4s, v7.4s -; CHECK-NEXT: add v1.4s, v6.4s, v21.4s -; CHECK-NEXT: smlal2 v22.4s, v17.8h, v19.8h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: add v1.4s, v1.4s, v22.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1584,34 +1584,34 @@ entry: 
define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: movi v18.2d, #0000000000000000 -; CHECK-NEXT: ldp q4, q5, [x1] ; CHECK-NEXT: ldr b0, [x0, #32] ; CHECK-NEXT: ldr b1, [x1, #32] +; CHECK-NEXT: ldp q2, q4, [x0] +; CHECK-NEXT: ldp q3, q6, [x1] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v6.8h, v2.8b, #0 -; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v7.8h, v4.8b, #0 -; CHECK-NEXT: ushll2 v4.8h, v4.16b, #0 -; CHECK-NEXT: ushll2 v16.8h, v3.16b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll2 v19.8h, v5.16b, #0 -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-NEXT: ushll v5.8h, v2.8b, #0 +; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: ushll2 v16.8h, v4.16b, #0 +; CHECK-NEXT: ushll v7.8h, v3.8b, #0 +; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-NEXT: ushll v4.8h, v4.8b, #0 ; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h -; CHECK-NEXT: umull2 v1.4s, v7.8h, v6.8h -; CHECK-NEXT: umull2 v17.4s, v4.8h, v2.8h -; CHECK-NEXT: umull v2.4s, v4.4h, v2.4h -; CHECK-NEXT: umlal2 v17.4s, v19.8h, v16.8h -; CHECK-NEXT: umlal2 v1.4s, v5.8h, v3.8h -; CHECK-NEXT: mov v18.s[0], v0.s[0] -; CHECK-NEXT: umlal v2.4s, v19.4h, v16.4h -; CHECK-NEXT: add v0.4s, v1.4s, v17.4s -; CHECK-NEXT: umlal v18.4s, v7.4h, v6.4h -; CHECK-NEXT: umlal v18.4s, v5.4h, v3.4h -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v0.4s, v18.4s, v0.4s +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ushll2 v19.8h, v6.16b, #0 +; CHECK-NEXT: ushll v6.8h, v6.8b, #0 +; CHECK-NEXT: umull2 v17.4s, v7.8h, v5.8h +; CHECK-NEXT: umull2 v18.4s, v3.8h, v2.8h +; CHECK-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NEXT: umull v0.4s, v3.4h, v2.4h +; CHECK-NEXT: umlal2 v18.4s, v19.8h, v16.8h +; CHECK-NEXT: umlal2 v17.4s, v6.8h, v4.8h +; CHECK-NEXT: umlal v1.4s, v7.4h, v5.4h +; CHECK-NEXT: umlal v0.4s, v19.4h, v16.4h +; CHECK-NEXT: add v2.4s, v17.4s, v18.4s +; CHECK-NEXT: umlal v1.4s, v6.4h, v4.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -1639,14 +1639,14 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) { ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: uaddl2 v6.4s, v3.8h, v2.8h -; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h +; CHECK-NEXT: uaddl2 v6.4s, v5.8h, v4.8h ; CHECK-NEXT: mov v0.s[0], v1.s[0] -; CHECK-NEXT: uaddl2 v1.4s, v5.8h, v4.8h -; CHECK-NEXT: add v1.4s, v1.4s, v6.4s +; CHECK-NEXT: uaddl2 v1.4s, v3.8h, v2.8h +; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h +; CHECK-NEXT: add v1.4s, v6.4s, v1.4s ; CHECK-NEXT: uaddw v0.4s, v0.4s, v5.4h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v4.4h ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: uaddw v0.4s, v0.4s, v4.4h ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -1660,34 +1660,34 @@ entry: define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: movi v18.2d, #0000000000000000 -; CHECK-NEXT: ldp q4, q5, [x1] ; CHECK-NEXT: ldr b0, [x0, #32] ; CHECK-NEXT: ldr b1, [x1, #32] +; CHECK-NEXT: ldp q2, q4, [x0] +; CHECK-NEXT: ldp q3, q6, [x1] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: 
sshll v6.8h, v2.8b, #0 -; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll v7.8h, v4.8b, #0 -; CHECK-NEXT: sshll2 v4.8h, v4.16b, #0 -; CHECK-NEXT: sshll2 v16.8h, v3.16b, #0 -; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: sshll2 v19.8h, v5.16b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: sshll v5.8h, v2.8b, #0 +; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: sshll2 v16.8h, v4.16b, #0 +; CHECK-NEXT: sshll v7.8h, v3.8b, #0 +; CHECK-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h -; CHECK-NEXT: smull2 v1.4s, v7.8h, v6.8h -; CHECK-NEXT: smull2 v17.4s, v4.8h, v2.8h -; CHECK-NEXT: smull v2.4s, v4.4h, v2.4h -; CHECK-NEXT: smlal2 v17.4s, v19.8h, v16.8h -; CHECK-NEXT: smlal2 v1.4s, v5.8h, v3.8h -; CHECK-NEXT: mov v18.s[0], v0.s[0] -; CHECK-NEXT: smlal v2.4s, v19.4h, v16.4h -; CHECK-NEXT: add v0.4s, v1.4s, v17.4s -; CHECK-NEXT: smlal v18.4s, v7.4h, v6.4h -; CHECK-NEXT: smlal v18.4s, v5.4h, v3.4h -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v0.4s, v18.4s, v0.4s +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sshll2 v19.8h, v6.16b, #0 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: smull2 v17.4s, v7.8h, v5.8h +; CHECK-NEXT: smull2 v18.4s, v3.8h, v2.8h +; CHECK-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NEXT: smull v0.4s, v3.4h, v2.4h +; CHECK-NEXT: smlal2 v18.4s, v19.8h, v16.8h +; CHECK-NEXT: smlal2 v17.4s, v6.8h, v4.8h +; CHECK-NEXT: smlal v1.4s, v7.4h, v5.4h +; CHECK-NEXT: smlal v0.4s, v19.4h, v16.4h +; CHECK-NEXT: add v2.4s, v17.4s, v18.4s +; CHECK-NEXT: smlal v1.4s, v6.4h, v4.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -2018,151 +2018,151 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldr b1, [sp, #80] +; CHECK-NEXT: ldr b0, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b2, [sp, #144] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ldr b3, [sp, #16] -; CHECK-NEXT: add x12, sp, #32 -; CHECK-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-NEXT: ld1 { v2.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: add x8, sp, #24 -; CHECK-NEXT: add x11, sp, #112 -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ld1 { v3.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #160 -; CHECK-NEXT: ldr b4, [sp, #480] -; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #104 -; CHECK-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-NEXT: add x10, sp, #152 +; CHECK-NEXT: add x11, sp, #160 +; CHECK-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: add x8, sp, #104 +; CHECK-NEXT: ldr b3, [sp, #16] +; CHECK-NEXT: ldr b5, [sp, #480] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: add x10, sp, #24 +; CHECK-NEXT: add x13, sp, #488 +; CHECK-NEXT: ldr b4, [sp, #608] +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: ld1 { v2.b }[2], [x11] +; CHECK-NEXT: add x12, sp, #112 +; CHECK-NEXT: ld1 { v3.b }[1], [x10] +; CHECK-NEXT: ld1 { v5.b }[1], [x13] +; CHECK-NEXT: add x10, sp, #616 +; CHECK-NEXT: mov v1.b[1], w1 +; CHECK-NEXT: ld1 { v4.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #32 +; CHECK-NEXT: ld1 { v0.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #168 -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: ld1 { v3.b }[2], [x12] -; CHECK-NEXT: add x12, sp, #40 -; CHECK-NEXT: ldr b5, [sp, #608] -; CHECK-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #120 ; CHECK-NEXT: ld1 { v2.b }[3], [x8] -; CHECK-NEXT: mov v0.b[1], w1 -; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: add x14, sp, #184 -; CHECK-NEXT: ldr b16, [sp, #544] -; CHECK-NEXT: ld1 { v3.b }[3], [x12] -; CHECK-NEXT: add x12, sp, #176 -; CHECK-NEXT: ldr b17, [sp, #672] -; CHECK-NEXT: ld1 { v1.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #488 -; CHECK-NEXT: ld1 { v2.b }[4], [x12] -; CHECK-NEXT: ld1 { v4.b }[1], [x11] -; CHECK-NEXT: mov v0.b[2], w2 -; CHECK-NEXT: add x11, sp, #192 -; CHECK-NEXT: ld1 { v3.b }[4], [x13] -; CHECK-NEXT: add x13, sp, #616 -; CHECK-NEXT: add x12, sp, #56 -; CHECK-NEXT: ld1 { v1.b }[5], [x10] -; CHECK-NEXT: ld1 { v5.b }[1], [x13] -; CHECK-NEXT: add x13, sp, #496 +; CHECK-NEXT: add x8, sp, #496 +; CHECK-NEXT: ld1 { v3.b }[2], [x10] +; CHECK-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #176 +; CHECK-NEXT: ldr b6, [sp, #544] +; CHECK-NEXT: ld1 { v0.b }[4], [x12] +; CHECK-NEXT: add x14, sp, #552 +; CHECK-NEXT: ldr b7, [sp, #672] +; CHECK-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-NEXT: add x13, sp, #40 +; CHECK-NEXT: ld1 { v6.b }[1], [x14] +; CHECK-NEXT: mov v1.b[2], w2 +; CHECK-NEXT: add x11, sp, #128 +; CHECK-NEXT: ld1 { v3.b }[3], [x13] +; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #680 +; CHECK-NEXT: add x13, sp, #184 +; CHECK-NEXT: ld1 { v7.b }[1], [x9] +; CHECK-NEXT: ld1 { v2.b }[5], [x13] +; CHECK-NEXT: add x13, sp, #624 +; CHECK-NEXT: add x15, sp, #504 ; CHECK-NEXT: ld1 { v4.b }[2], [x13] -; CHECK-NEXT: ld1 { v2.b }[5], [x14] -; CHECK-NEXT: add x14, sp, #680 -; CHECK-NEXT: ld1 { v17.b }[1], [x14] -; CHECK-NEXT: add x13, sp, #504 -; CHECK-NEXT: ld1 { v3.b }[5], [x12] -; CHECK-NEXT: ld1 { v1.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #552 -; CHECK-NEXT: add x12, sp, #688 -; CHECK-NEXT: ld1 { v16.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #624 -; CHECK-NEXT: ld1 { 
v4.b }[3], [x13] -; CHECK-NEXT: ld1 { v2.b }[6], [x11] +; CHECK-NEXT: add x10, sp, #136 +; CHECK-NEXT: ld1 { v0.b }[6], [x11] ; CHECK-NEXT: add x11, sp, #560 -; CHECK-NEXT: add x8, sp, #136 -; CHECK-NEXT: ld1 { v17.b }[2], [x12] -; CHECK-NEXT: ld1 { v5.b }[2], [x9] -; CHECK-NEXT: ld1 { v1.b }[7], [x8] -; CHECK-NEXT: ld1 { v16.b }[2], [x11] -; CHECK-NEXT: add x8, sp, #512 -; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v4.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #568 -; CHECK-NEXT: add x9, sp, #696 -; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v17.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #520 -; CHECK-NEXT: ld1 { v16.b }[3], [x8] -; CHECK-NEXT: ld1 { v5.b }[3], [x11] -; CHECK-NEXT: add x8, sp, #640 -; CHECK-NEXT: ld1 { v4.b }[5], [x9] +; CHECK-NEXT: ld1 { v5.b }[3], [x15] +; CHECK-NEXT: ld1 { v6.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #688 +; CHECK-NEXT: mov v1.b[3], w3 +; CHECK-NEXT: ld1 { v7.b }[2], [x11] +; CHECK-NEXT: add x9, sp, #632 +; CHECK-NEXT: add x11, sp, #512 +; CHECK-NEXT: ld1 { v0.b }[7], [x10] +; CHECK-NEXT: ld1 { v4.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #568 +; CHECK-NEXT: add x10, sp, #696 +; CHECK-NEXT: ld1 { v6.b }[3], [x9] +; CHECK-NEXT: ld1 { v5.b }[4], [x11] +; CHECK-NEXT: ld1 { v7.b }[3], [x10] +; CHECK-NEXT: add x9, sp, #640 +; CHECK-NEXT: mov v1.b[4], w4 +; CHECK-NEXT: ld1 { v4.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #576 -; CHECK-NEXT: add x11, sp, #704 +; CHECK-NEXT: add x10, sp, #704 +; CHECK-NEXT: add x11, sp, #520 +; CHECK-NEXT: ld1 { v6.b }[4], [x9] ; CHECK-NEXT: ldr b18, [sp, #736] -; CHECK-NEXT: mov v0.b[4], w4 -; CHECK-NEXT: ld1 { v17.b }[4], [x11] -; CHECK-NEXT: ld1 { v16.b }[4], [x9] -; CHECK-NEXT: ld1 { v5.b }[4], [x8] -; CHECK-NEXT: add x9, sp, #528 -; CHECK-NEXT: sshll v18.8h, v18.8b, #0 -; CHECK-NEXT: add x8, sp, #648 +; CHECK-NEXT: ld1 { v7.b }[4], [x10] +; CHECK-NEXT: ld1 { v5.b }[5], [x11] +; CHECK-NEXT: add x12, sp, #192 +; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: ld1 { v2.b }[6], [x12] +; CHECK-NEXT: add x9, sp, #648 +; CHECK-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-NEXT: add x10, sp, #528 ; CHECK-NEXT: add x11, sp, #584 ; CHECK-NEXT: add x12, sp, #712 -; CHECK-NEXT: ld1 { v4.b }[6], [x9] -; CHECK-NEXT: movi v7.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v16.b }[5], [x11] -; CHECK-NEXT: ld1 { v17.b }[5], [x12] -; CHECK-NEXT: ld1 { v5.b }[5], [x8] -; CHECK-NEXT: mov v0.b[5], w5 -; CHECK-NEXT: add x9, sp, #536 -; CHECK-NEXT: sshll v18.4s, v18.4h, #0 -; CHECK-NEXT: add x8, sp, #656 +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: mov v1.b[5], w5 +; CHECK-NEXT: ld1 { v6.b }[5], [x11] +; CHECK-NEXT: ld1 { v7.b }[5], [x12] +; CHECK-NEXT: ld1 { v4.b }[5], [x9] +; CHECK-NEXT: ld1 { v5.b }[6], [x10] +; CHECK-NEXT: add x14, sp, #56 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v3.b }[5], [x14] +; CHECK-NEXT: add x9, sp, #656 +; CHECK-NEXT: add x10, sp, #536 ; CHECK-NEXT: add x11, sp, #592 ; CHECK-NEXT: add x12, sp, #720 -; CHECK-NEXT: ld1 { v4.b }[7], [x9] -; CHECK-NEXT: ld1 { v16.b }[6], [x11] -; CHECK-NEXT: ld1 { v17.b }[6], [x12] -; CHECK-NEXT: ld1 { v5.b }[6], [x8] -; CHECK-NEXT: ldr b6, [sp, #208] -; CHECK-NEXT: add x10, sp, #64 -; CHECK-NEXT: mov v7.s[0], v18.s[0] -; CHECK-NEXT: mov v0.b[6], w6 -; CHECK-NEXT: ld1 { v3.b }[6], [x10] +; CHECK-NEXT: sshll v18.4s, v18.4h, #0 +; CHECK-NEXT: ldr b16, [sp, #208] +; CHECK-NEXT: ld1 { v6.b }[6], [x11] +; CHECK-NEXT: ld1 { v7.b }[6], [x12] +; CHECK-NEXT: ld1 { v4.b }[6], [x9] +; CHECK-NEXT: ld1 { v5.b }[7], [x10] +; CHECK-NEXT: add x8, sp, #64 +; CHECK-NEXT: mov v1.b[6], w6 
+; CHECK-NEXT: sshll v16.8h, v16.8b, #0 +; CHECK-NEXT: ld1 { v3.b }[6], [x8] ; CHECK-NEXT: add x8, sp, #664 ; CHECK-NEXT: add x9, sp, #600 ; CHECK-NEXT: add x10, sp, #728 -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: ld1 { v16.b }[7], [x9] -; CHECK-NEXT: ld1 { v17.b }[7], [x10] -; CHECK-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-NEXT: mov v17.s[0], v18.s[0] +; CHECK-NEXT: ld1 { v6.b }[7], [x9] +; CHECK-NEXT: ld1 { v7.b }[7], [x10] +; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 ; CHECK-NEXT: movi v18.2d, #0000000000000000 -; CHECK-NEXT: mov v0.b[7], w7 +; CHECK-NEXT: sshll v16.4s, v16.4h, #0 +; CHECK-NEXT: mov v1.b[7], w7 ; CHECK-NEXT: add x9, sp, #200 -; CHECK-NEXT: add x10, sp, #72 -; CHECK-NEXT: saddw v7.4s, v7.4s, v4.4h -; CHECK-NEXT: sshll v6.4s, v6.4h, #0 -; CHECK-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: add x8, sp, #72 ; CHECK-NEXT: ld1 { v2.b }[7], [x9] -; CHECK-NEXT: ld1 { v3.b }[7], [x10] -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: mov v18.s[0], v6.s[0] +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: sshll v7.8h, v7.8b, #0 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: saddw v17.4s, v17.4s, v5.4h +; CHECK-NEXT: mov v18.s[0], v16.s[0] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: saddl2 v6.4s, v17.8h, v16.8h -; CHECK-NEXT: saddl2 v4.4s, v5.8h, v4.8h -; CHECK-NEXT: saddl v16.4s, v17.4h, v16.4h -; CHECK-NEXT: saddw v5.4s, v7.4s, v5.4h +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: saddl2 v17.4s, v0.8h, v1.8h -; CHECK-NEXT: saddw v0.4s, v18.4s, v0.4h +; CHECK-NEXT: saddl2 v16.4s, v7.8h, v6.8h +; CHECK-NEXT: saddl2 v5.4s, v4.8h, v5.8h +; CHECK-NEXT: saddl v6.4s, v7.4h, v6.4h +; CHECK-NEXT: saddw v4.4s, v17.4s, v4.4h +; CHECK-NEXT: saddl2 v17.4s, v1.8h, v0.8h +; CHECK-NEXT: saddw v1.4s, v18.4s, v1.4h ; CHECK-NEXT: saddl2 v7.4s, v3.8h, v2.8h -; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h ; CHECK-NEXT: add v5.4s, v5.4s, v16.4s -; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h +; CHECK-NEXT: add v4.4s, v4.4s, v6.4s +; CHECK-NEXT: saddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: add v6.4s, v17.4s, v7.4s -; CHECK-NEXT: add v1.4s, v5.4s, v4.4s +; CHECK-NEXT: add v1.4s, v4.4s, v5.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v1.4s, v6.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 913205f3275367..16200435c5c31d 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -515,15 +515,15 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; CHECK-NEXT: mov v0.b[6], w6 ; CHECK-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-NEXT: uaddl v2.8h, v3.8b, v2.8b +; CHECK-NEXT: ushll v3.4s, v4.4h, #0 ; CHECK-NEXT: mov v0.b[7], w7 -; CHECK-NEXT: ushll2 v3.4s, v2.8h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: ushll v1.4s, v4.4h, #0 -; CHECK-NEXT: stp q3, q1, [x8, #48] -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: stp q1, q3, [x8, #48] +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: stp q1, q2, [x8, #16] +; CHECK-NEXT: stp q3, q2, [x8, 
#16] ; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/neon-shift-neg.ll b/llvm/test/CodeGen/AArch64/neon-shift-neg.ll index 881bbf315e8e99..45272143e8592f 100644 --- a/llvm/test/CodeGen/AArch64/neon-shift-neg.ll +++ b/llvm/test/CodeGen/AArch64/neon-shift-neg.ll @@ -375,8 +375,8 @@ entry: define @shrn64x2( %a, i64 %b) { ; CHECK-LABEL: shrn64x2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: neg x8, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -391,8 +391,8 @@ entry: define @shrn32x4( %a, i32 %b) { ; CHECK-LABEL: shrn32x4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -407,8 +407,8 @@ entry: define @shrn16x8( %a, i16 %b) { ; CHECK-LABEL: shrn16x8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -423,8 +423,8 @@ entry: define @shrn8x16( %a, i8 %b) { ; CHECK-LABEL: shrn8x16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret @@ -439,8 +439,8 @@ entry: define @lshrn64x2( %a, i64 %b) { ; CHECK-LABEL: lshrn64x2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: neg x8, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -455,8 +455,8 @@ entry: define @lshrn32x4( %a, i32 %b) { ; CHECK-LABEL: lshrn32x4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -471,8 +471,8 @@ entry: define @lshrn16x8( %a, i16 %b) { ; CHECK-LABEL: lshrn16x8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -487,8 +487,8 @@ entry: define @lshrn8x16( %a, i8 %b) { ; CHECK-LABEL: lshrn8x16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret @@ -503,8 +503,8 @@ entry: define @shln64x2( %a, i64 %b) { ; CHECK-LABEL: shln64x2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: neg x8, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -519,8 +519,8 @@ entry: define @shln32x4( %a, i32 %b) { ; CHECK-LABEL: shln32x4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -535,8 +535,8 @@ entry: define @shln16x8( %a, i16 %b) { ; CHECK-LABEL: shln16x8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -551,8 +551,8 @@ entry: define @shln8x16( %a, i8 %b) { ; CHECK-LABEL: shln8x16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov 
z1.b, w8 ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/predicated-add-sub.ll b/llvm/test/CodeGen/AArch64/predicated-add-sub.ll index 6b3cfc040cb3d4..20088354bdb755 100644 --- a/llvm/test/CodeGen/AArch64/predicated-add-sub.ll +++ b/llvm/test/CodeGen/AArch64/predicated-add-sub.ll @@ -83,11 +83,11 @@ define @zext.add.2xi64( %a, @zext.add.8xi32( %a, %v) #0 { ; CHECK-LABEL: zext.add.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: mov z2.s, #1 // =0x1 +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: add z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: ret %extend = zext %v to %result = add %a, %extend @@ -103,8 +103,8 @@ define @zext.add.16xi32( %a, @zext.sub.2xi64( %a, @zext.sub.8xi32( %a, %v) #0 { ; CHECK-LABEL: zext.sub.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: add z0.s, p1/m, z0.s, z2.s ; CHECK-NEXT: add z1.s, p0/m, z1.s, z2.s @@ -214,8 +214,8 @@ define @zext.sub.16xi32( %a, @sext.add.2xi64( %a, @sext.add.8xi32( %a, %v) #0 { ; CHECK-LABEL: sext.add.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: add z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: ret %extend = sext %v to %result = add %a, %extend @@ -325,8 +325,8 @@ define @sext.add.16xi32( %a, @sext.sub.2xi64( %a, @sext.sub.8xi32( %a, %v) #0 { ; CHECK-LABEL: sext.sub.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: sub z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: sub z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: sub z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: ret %extend = sext %v to %result = sub %a, %extend @@ -436,8 +436,8 @@ define @sext.sub.16xi32( %a, This Inner Loop Header: Depth=1 +; CHECK-NEXT: str q14, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: ldr q14, [x8] ; CHECK-NEXT: mov x12, xzr -; CHECK-NEXT: str q18, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: ldr x14, [x12] -; CHECK-NEXT: ldr q15, [x12] -; CHECK-NEXT: add x7, x11, x8 +; CHECK-NEXT: stp q29, q15, [sp] // 32-byte Folded Spill +; CHECK-NEXT: add x19, x11, x8 ; CHECK-NEXT: fmov x15, d14 ; CHECK-NEXT: mov x16, v14.d[1] -; CHECK-NEXT: ldr q18, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov x18, d15 -; CHECK-NEXT: mov x13, v15.d[1] -; CHECK-NEXT: ldr x5, [x8] +; CHECK-NEXT: ldr q15, [x12] ; CHECK-NEXT: ldr q14, [x10], #64 -; CHECK-NEXT: ldr x7, [x7, #128] +; CHECK-NEXT: mov v8.16b, v28.16b +; CHECK-NEXT: fmov x13, d15 +; CHECK-NEXT: mov x18, v15.d[1] +; CHECK-NEXT: mov v28.16b, v24.16b ; CHECK-NEXT: mul x17, x15, x14 -; CHECK-NEXT: mov v6.16b, v0.16b -; CHECK-NEXT: mov v9.16b, v27.16b ; CHECK-NEXT: mov x12, v14.d[1] ; CHECK-NEXT: fmov x4, d14 -; CHECK-NEXT: mov v27.16b, v23.16b +; CHECK-NEXT: mov v24.16b, v20.16b +; CHECK-NEXT: mov v20.16b, v17.16b +; CHECK-NEXT: mov v17.16b, v5.16b ; CHECK-NEXT: mul x1, x16, x14 -; CHECK-NEXT: mov v23.16b, v19.16b -; CHECK-NEXT: mov v19.16b, v7.16b -; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: stp q26, q31, [sp] // 32-byte Folded Spill -; CHECK-NEXT: mov v31.16b, 
v22.16b -; CHECK-NEXT: mul x0, x18, x14 -; CHECK-NEXT: mov v26.16b, v10.16b -; CHECK-NEXT: mov v22.16b, v5.16b +; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x5, [x8] +; CHECK-NEXT: ldr x19, [x19, #128] +; CHECK-NEXT: mov v29.16b, v21.16b +; CHECK-NEXT: mov v21.16b, v0.16b +; CHECK-NEXT: mul x0, x13, x14 +; CHECK-NEXT: mov v25.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v2.16b ; CHECK-NEXT: fmov d15, x17 -; CHECK-NEXT: mov v5.16b, v1.16b -; CHECK-NEXT: mov v8.16b, v20.16b -; CHECK-NEXT: mul x2, x13, x14 -; CHECK-NEXT: mov v20.16b, v16.16b -; CHECK-NEXT: mov v16.16b, v3.16b -; CHECK-NEXT: mov v10.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v17.16b -; CHECK-NEXT: mov v17.16b, v4.16b +; CHECK-NEXT: mov v26.16b, v22.16b +; CHECK-NEXT: mov v22.16b, v18.16b +; CHECK-NEXT: mul x2, x18, x14 +; CHECK-NEXT: mov v18.16b, v7.16b +; CHECK-NEXT: mov v7.16b, v3.16b +; CHECK-NEXT: mov v16.16b, v4.16b +; CHECK-NEXT: add x8, x8, #8 +; CHECK-NEXT: add x9, x9, #1 ; CHECK-NEXT: mov v15.d[1], x1 ; CHECK-NEXT: mul x3, x12, x14 -; CHECK-NEXT: add x8, x8, #8 -; CHECK-NEXT: fmov d14, x0 ; CHECK-NEXT: cmp x8, #64 -; CHECK-NEXT: add x9, x9, #1 +; CHECK-NEXT: fmov d14, x0 ; CHECK-NEXT: mul x14, x4, x14 -; CHECK-NEXT: add v18.2d, v18.2d, v15.2d -; CHECK-NEXT: mul x19, x15, x5 +; CHECK-NEXT: add v5.2d, v5.2d, v15.2d +; CHECK-NEXT: mul x20, x15, x5 ; CHECK-NEXT: mov v14.d[1], x2 -; CHECK-NEXT: mul x15, x15, x7 +; CHECK-NEXT: mul x15, x15, x19 ; CHECK-NEXT: fmov d0, x14 -; CHECK-NEXT: str q18, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: ldp q18, q15, [sp, #32] // 32-byte Folded Reload -; CHECK-NEXT: mul x6, x16, x5 -; CHECK-NEXT: fmov d1, x19 +; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: ldr q5, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: mul x21, x13, x19 +; CHECK-NEXT: add v5.2d, v5.2d, v14.2d +; CHECK-NEXT: fmov d3, x20 +; CHECK-NEXT: mul x7, x16, x5 ; CHECK-NEXT: mov v0.d[1], x3 -; CHECK-NEXT: mul x16, x16, x7 -; CHECK-NEXT: fmov d2, x15 -; CHECK-NEXT: add v15.2d, v15.2d, v14.2d -; CHECK-NEXT: mul x21, x18, x7 -; CHECK-NEXT: mov v1.d[1], x6 -; CHECK-NEXT: mul x0, x4, x7 -; CHECK-NEXT: str q15, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: add v15.2d, v11.2d, v14.2d -; CHECK-NEXT: mov v2.d[1], x16 -; CHECK-NEXT: ldr q11, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: mul x20, x13, x7 -; CHECK-NEXT: fmov d3, x21 -; CHECK-NEXT: add v11.2d, v11.2d, v0.2d -; CHECK-NEXT: add v12.2d, v12.2d, v1.2d -; CHECK-NEXT: mul x22, x12, x7 -; CHECK-NEXT: fmov d4, x0 -; CHECK-NEXT: add v18.2d, v18.2d, v2.2d -; CHECK-NEXT: mov v2.16b, v7.16b -; CHECK-NEXT: mul x14, x18, x5 -; CHECK-NEXT: mov v7.16b, v19.16b -; CHECK-NEXT: mov v19.16b, v23.16b -; CHECK-NEXT: mov v3.d[1], x20 -; CHECK-NEXT: mov v23.16b, v27.16b -; CHECK-NEXT: add v27.2d, v9.2d, v1.2d -; CHECK-NEXT: mul x15, x4, x5 -; CHECK-NEXT: str q11, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: mov v11.16b, v15.16b -; CHECK-NEXT: mov v4.d[1], x22 -; CHECK-NEXT: add v19.2d, v19.2d, v1.2d -; CHECK-NEXT: add v7.2d, v7.2d, v1.2d +; CHECK-NEXT: fmov d1, x15 +; CHECK-NEXT: mul x16, x16, x19 +; CHECK-NEXT: str q5, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: add v5.2d, v13.2d, v14.2d +; CHECK-NEXT: fmov d2, x21 +; CHECK-NEXT: ldr q13, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: mul x6, x18, x5 +; CHECK-NEXT: ldp q15, q14, [sp, #16] // 32-byte Folded Reload +; CHECK-NEXT: mov v3.d[1], x7 +; CHECK-NEXT: add v13.2d, v13.2d, v0.2d +; CHECK-NEXT: mul x18, x18, x19 +; CHECK-NEXT: mov v1.d[1], x16 +; CHECK-NEXT: mul x22, 
x4, x19 +; CHECK-NEXT: str q13, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov v13.16b, v5.16b +; CHECK-NEXT: mov v5.16b, v17.16b +; CHECK-NEXT: mov v17.16b, v20.16b +; CHECK-NEXT: mov v20.16b, v24.16b ; CHECK-NEXT: mul x13, x13, x5 -; CHECK-NEXT: add v23.2d, v23.2d, v1.2d -; CHECK-NEXT: add v1.2d, v5.2d, v1.2d -; CHECK-NEXT: fmov d14, x14 -; CHECK-NEXT: add v30.2d, v30.2d, v3.2d -; CHECK-NEXT: mov v3.16b, v16.16b +; CHECK-NEXT: mov v24.16b, v28.16b +; CHECK-NEXT: add v11.2d, v11.2d, v3.2d +; CHECK-NEXT: mov v2.d[1], x18 +; CHECK-NEXT: add v15.2d, v15.2d, v1.2d +; CHECK-NEXT: add v27.2d, v27.2d, v3.2d +; CHECK-NEXT: mul x17, x12, x19 +; CHECK-NEXT: add v23.2d, v23.2d, v3.2d +; CHECK-NEXT: add v19.2d, v19.2d, v3.2d +; CHECK-NEXT: fmov d4, x22 +; CHECK-NEXT: add v10.2d, v10.2d, v3.2d +; CHECK-NEXT: mul x14, x4, x5 +; CHECK-NEXT: fmov d0, x13 +; CHECK-NEXT: add v14.2d, v14.2d, v2.2d +; CHECK-NEXT: add v2.2d, v6.2d, v3.2d ; CHECK-NEXT: mul x12, x12, x5 -; CHECK-NEXT: mov v16.16b, v20.16b -; CHECK-NEXT: mov v5.16b, v22.16b -; CHECK-NEXT: fmov d0, x15 -; CHECK-NEXT: add v28.2d, v28.2d, v4.2d -; CHECK-NEXT: mov v4.16b, v17.16b -; CHECK-NEXT: mov v17.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v10.16b -; CHECK-NEXT: mov v10.16b, v26.16b -; CHECK-NEXT: mov v14.d[1], x13 -; CHECK-NEXT: mov v22.16b, v31.16b -; CHECK-NEXT: ldp q26, q31, [sp] // 32-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x12 -; CHECK-NEXT: add v13.2d, v13.2d, v14.2d -; CHECK-NEXT: add v31.2d, v31.2d, v14.2d -; CHECK-NEXT: add v26.2d, v26.2d, v14.2d -; CHECK-NEXT: add v24.2d, v24.2d, v14.2d -; CHECK-NEXT: add v22.2d, v22.2d, v14.2d -; CHECK-NEXT: add v20.2d, v8.2d, v14.2d -; CHECK-NEXT: add v10.2d, v10.2d, v14.2d -; CHECK-NEXT: add v16.2d, v16.2d, v14.2d -; CHECK-NEXT: add v5.2d, v5.2d, v14.2d -; CHECK-NEXT: add v3.2d, v3.2d, v14.2d -; CHECK-NEXT: add v2.2d, v2.2d, v14.2d -; CHECK-NEXT: add v29.2d, v29.2d, v0.2d -; CHECK-NEXT: add v25.2d, v25.2d, v0.2d -; CHECK-NEXT: add v21.2d, v21.2d, v0.2d +; CHECK-NEXT: mov v3.16b, v7.16b +; CHECK-NEXT: mov v7.16b, v18.16b +; CHECK-NEXT: mov v4.d[1], x17 +; CHECK-NEXT: mov v18.16b, v22.16b +; CHECK-NEXT: mov v0.d[1], x6 +; CHECK-NEXT: fmov d1, x14 +; CHECK-NEXT: add v28.2d, v8.2d, v4.2d +; CHECK-NEXT: mov v1.d[1], x12 +; CHECK-NEXT: add v31.2d, v31.2d, v0.2d +; CHECK-NEXT: add v30.2d, v30.2d, v0.2d +; CHECK-NEXT: add v12.2d, v12.2d, v0.2d +; CHECK-NEXT: add v24.2d, v24.2d, v0.2d +; CHECK-NEXT: add v22.2d, v26.2d, v0.2d +; CHECK-NEXT: add v20.2d, v20.2d, v0.2d +; CHECK-NEXT: add v18.2d, v18.2d, v0.2d ; CHECK-NEXT: add v17.2d, v17.2d, v0.2d -; CHECK-NEXT: add v4.2d, v4.2d, v0.2d -; CHECK-NEXT: add v0.2d, v6.2d, v0.2d +; CHECK-NEXT: add v7.2d, v7.2d, v0.2d +; CHECK-NEXT: add v4.2d, v16.2d, v0.2d +; CHECK-NEXT: add v3.2d, v3.2d, v0.2d +; CHECK-NEXT: mov v0.16b, v21.16b +; CHECK-NEXT: mov v21.16b, v29.16b +; CHECK-NEXT: ldr q29, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add v9.2d, v9.2d, v1.2d +; CHECK-NEXT: add v6.2d, v25.2d, v1.2d +; CHECK-NEXT: add v5.2d, v5.2d, v1.2d +; CHECK-NEXT: add v29.2d, v29.2d, v1.2d +; CHECK-NEXT: add v21.2d, v21.2d, v1.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup -; CHECK-NEXT: ldr q6, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C -; CHECK-NEXT: stp q12, q31, [x8, #80] +; CHECK-NEXT: stp q11, q30, [x8, #80] ; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload -; CHECK-NEXT: str 
q6, [x8] -; CHECK-NEXT: ldr q6, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: str q29, [x8, #112] +; CHECK-NEXT: str q1, [x8] +; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: stp q15, q14, [x8, #144] ; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload -; CHECK-NEXT: stp q6, q11, [x8, #16] -; CHECK-NEXT: ldr q6, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: stp q18, q30, [x8, #144] -; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload -; CHECK-NEXT: stp q6, q13, [x8, #48] +; CHECK-NEXT: stp q1, q13, [x8, #16] +; CHECK-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: stp q28, q12, [x8, #176] ; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: stp q28, q26, [x8, #176] +; CHECK-NEXT: stp q1, q31, [x8, #48] ; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: stp q19, q10, [x8, #336] +; CHECK-NEXT: stp q9, q24, [x8, #240] +; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload +; CHECK-NEXT: stp q19, q18, [x8, #336] +; CHECK-NEXT: stp q10, q7, [x8, #400] ; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload +; CHECK-NEXT: str q29, [x8, #112] ; CHECK-NEXT: str q27, [x8, #208] -; CHECK-NEXT: stp q25, q24, [x8, #240] ; CHECK-NEXT: stp q23, q22, [x8, #272] ; CHECK-NEXT: stp q21, q20, [x8, #304] -; CHECK-NEXT: stp q17, q16, [x8, #368] -; CHECK-NEXT: stp q7, q5, [x8, #400] -; CHECK-NEXT: stp q4, q3, [x8, #432] -; CHECK-NEXT: stp q1, q2, [x8, #464] +; CHECK-NEXT: stp q6, q17, [x8, #368] +; CHECK-NEXT: stp q5, q4, [x8, #432] +; CHECK-NEXT: stp q2, q3, [x8, #464] ; CHECK-NEXT: str q0, [x8, #496] ; CHECK-NEXT: add sp, sp, #192 ; CHECK-NEXT: .cfi_def_cfa_offset 0 diff --git a/llvm/test/CodeGen/AArch64/rcpc3-sve.ll b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll index 47e33815174994..6b03e5d12bfd38 100644 --- a/llvm/test/CodeGen/AArch64/rcpc3-sve.ll +++ b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll @@ -8,8 +8,8 @@ define hidden @test_load_sve_lane0(ptr nocapture noundef readonly %a, noundef %b) local_unnamed_addr { ; CHECK-LABEL: test_load_sve_lane0: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: ldapr x8, [x0] +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %1 = load atomic i64, ptr %a acquire, align 8 @@ -20,9 +20,9 @@ define hidden @test_load_sve_lane0(ptr nocapture noundef read define hidden @test_load_sve_lane1(ptr nocapture noundef readonly %a, noundef %b) local_unnamed_addr { ; CHECK-LABEL: test_load_sve_lane1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: ldapr x8, [x0] ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d diff --git a/llvm/test/CodeGen/AArch64/reassocmls.ll b/llvm/test/CodeGen/AArch64/reassocmls.ll index 381caffba92eb0..acbf9fc584a2ea 100644 --- a/llvm/test/CodeGen/AArch64/reassocmls.ll +++ b/llvm/test/CodeGen/AArch64/reassocmls.ll @@ -79,7 +79,7 @@ define i64 @mls_i64_C(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e) { ; CHECK-LABEL: mls_i64_C: ; CHECK: // %bb.0: ; CHECK-NEXT: mul x8, x2, x1 -; CHECK-NEXT: mov w9, #10 +; CHECK-NEXT: mov w9, #10 // =0xa ; CHECK-NEXT: madd x8, x4, x3, x8 ; CHECK-NEXT: sub x0, x9, x8 ; CHECK-NEXT: ret @@ -290,9 +290,9 @@ define @smlsl_nxv8i16( %a, @umlsl_nxv8i16( %a, %b, %c, %d, %e) { ; CHECK-LABEL: umlsl_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z3.h, z3.h, #0xff ; CHECK-NEXT: and z4.h, z4.h, #0xff +; CHECK-NEXT: ptrue p0.h ; 
CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z2.h, z2.h, #0xff ; CHECK-NEXT: mls z0.h, p0/m, z4.h, z3.h @@ -326,8 +326,8 @@ define @mls_nxv8i16( %a, define @mla_nxv8i16( %a, %b, %c, %d, %e) { ; CHECK-LABEL: mla_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mul z1.h, z2.h, z1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mla z1.h, p0/m, z4.h, z3.h ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll index a080a7403811fc..325ab444205bf9 100644 --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -20,111 +20,111 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: ldr d6, [x10, x8] ; CHECK-NEXT: ldr d5, [x11] ; CHECK-NEXT: ldr d7, [x11, x9] -; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b ; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: usubl v1.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b +; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b ; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b -; CHECK-NEXT: shll2 v4.4s, v2.8h, #16 -; CHECK-NEXT: shll2 v5.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 ; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 -; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 -; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h -; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h +; CHECK-NEXT: shll2 v7.4s, v2.8h, #16 +; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h ; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h -; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h +; CHECK-NEXT: saddw v2.4s, v7.4s, v2.4h +; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s +; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s ; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: zip1 v4.4s, v2.4s, v0.4s -; CHECK-NEXT: zip2 v6.4s, v2.4s, v0.4s -; CHECK-NEXT: uzp2 v5.4s, v3.4s, v1.4s -; CHECK-NEXT: mov v17.16b, v1.16b -; CHECK-NEXT: zip2 v16.4s, v1.4s, v3.4s -; CHECK-NEXT: mov v7.s[3], v0.s[2] -; CHECK-NEXT: ext v18.16b, v3.16b, v3.16b, #12 -; CHECK-NEXT: ext v2.16b, v2.16b, v4.16b, #8 -; CHECK-NEXT: mov v17.s[1], v3.s[0] +; CHECK-NEXT: ext v17.16b, v3.16b, v3.16b, #12 +; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s +; CHECK-NEXT: ext v16.16b, v1.16b, v4.16b, #8 +; CHECK-NEXT: mov v1.s[3], v0.s[2] +; CHECK-NEXT: mov v7.s[1], v3.s[0] ; CHECK-NEXT: uzp2 v0.4s, v5.4s, v3.4s -; CHECK-NEXT: zip2 v5.4s, v3.4s, v1.4s -; CHECK-NEXT: mov v3.s[0], v1.s[1] -; CHECK-NEXT: ext v1.16b, v1.16b, v18.16b, #12 -; CHECK-NEXT: mov v16.d[1], v7.d[1] -; CHECK-NEXT: mov v17.d[1], v2.d[1] +; CHECK-NEXT: zip2 v5.4s, v2.4s, v3.4s +; CHECK-NEXT: mov v3.s[0], v2.s[1] +; CHECK-NEXT: ext v2.16b, v2.16b, v17.16b, #12 +; CHECK-NEXT: mov v18.d[1], v1.d[1] +; CHECK-NEXT: mov v7.d[1], v16.d[1] ; CHECK-NEXT: mov v0.d[1], v6.d[1] -; CHECK-NEXT: mov v5.d[1], v7.d[1] ; CHECK-NEXT: mov v3.d[1], v4.d[1] -; CHECK-NEXT: mov v1.d[1], v6.d[1] -; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: add v2.4s, v3.4s, v17.4s -; CHECK-NEXT: sub v3.4s, v17.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v16.4s, v1.4s +; CHECK-NEXT: mov v5.d[1], v1.d[1] +; CHECK-NEXT: mov v2.d[1], v6.d[1] +; CHECK-NEXT: add v0.4s, v0.4s, v18.4s +; CHECK-NEXT: add v1.4s, v3.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v3.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v2.4s ; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: add v6.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s +; CHECK-NEXT: 
sub v5.4s, v3.4s, v2.4s +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-NEXT: mov v4.d[1], v0.d[1] -; CHECK-NEXT: mov v5.d[1], v2.d[1] -; CHECK-NEXT: rev64 v3.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s -; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: rev64 v4.4s, v6.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: rev64 v7.4s, v0.4s -; CHECK-NEXT: addp v16.4s, v0.4s, v6.4s -; CHECK-NEXT: addp v17.4s, v2.4s, v1.4s -; CHECK-NEXT: sub v4.4s, v6.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s -; CHECK-NEXT: zip1 v21.4s, v16.4s, v16.4s -; CHECK-NEXT: ext v5.16b, v17.16b, v1.16b, #4 -; CHECK-NEXT: ext v6.16b, v16.16b, v4.16b, #4 -; CHECK-NEXT: mov v18.16b, v1.16b -; CHECK-NEXT: mov v19.16b, v4.16b -; CHECK-NEXT: ext v3.16b, v2.16b, v17.16b, #8 -; CHECK-NEXT: ext v7.16b, v0.16b, v16.16b, #4 -; CHECK-NEXT: mov v18.s[2], v17.s[3] -; CHECK-NEXT: zip2 v5.4s, v5.4s, v17.4s -; CHECK-NEXT: zip2 v6.4s, v6.4s, v16.4s -; CHECK-NEXT: mov v19.s[2], v16.s[3] -; CHECK-NEXT: trn2 v0.4s, v21.4s, v0.4s -; CHECK-NEXT: ext v20.16b, v3.16b, v2.16b, #4 -; CHECK-NEXT: ext v7.16b, v7.16b, v7.16b, #4 -; CHECK-NEXT: mov v2.s[2], v17.s[1] -; CHECK-NEXT: ext v1.16b, v1.16b, v5.16b, #12 -; CHECK-NEXT: ext v4.16b, v4.16b, v6.16b, #12 -; CHECK-NEXT: mov v5.16b, v18.16b -; CHECK-NEXT: uzp2 v3.4s, v3.4s, v20.4s -; CHECK-NEXT: mov v6.16b, v7.16b -; CHECK-NEXT: mov v20.16b, v19.16b -; CHECK-NEXT: mov v21.16b, v2.16b -; CHECK-NEXT: mov v5.s[1], v17.s[2] -; CHECK-NEXT: sub v7.4s, v0.4s, v7.4s -; CHECK-NEXT: mov v6.s[0], v16.s[1] -; CHECK-NEXT: mov v20.s[1], v16.s[2] -; CHECK-NEXT: sub v16.4s, v19.4s, v4.4s -; CHECK-NEXT: mov v21.s[1], v17.s[0] -; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s -; CHECK-NEXT: sub v17.4s, v18.4s, v1.4s -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: mov v6.d[1], v1.d[1] +; CHECK-NEXT: rev64 v3.4s, v5.4s +; CHECK-NEXT: rev64 v7.4s, v2.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v6.4s -; CHECK-NEXT: add v4.4s, v20.4s, v4.4s -; CHECK-NEXT: add v3.4s, v21.4s, v3.4s -; CHECK-NEXT: mov v1.d[1], v17.d[1] -; CHECK-NEXT: mov v0.d[1], v7.d[1] -; CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v3.d[1], v2.d[1] +; CHECK-NEXT: sub v3.4s, v5.4s, v3.4s +; CHECK-NEXT: addp v4.4s, v1.4s, v5.4s +; CHECK-NEXT: sub v5.4s, v2.4s, v7.4s +; CHECK-NEXT: addp v2.4s, v0.4s, v2.4s +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: rev64 v7.4s, v1.4s +; CHECK-NEXT: ext v16.16b, v4.16b, v3.16b, #4 +; CHECK-NEXT: ext v17.16b, v2.16b, v5.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v7.4s +; CHECK-NEXT: mov v7.16b, v3.16b +; CHECK-NEXT: zip2 v6.4s, v16.4s, v4.4s +; CHECK-NEXT: mov v16.16b, v5.16b +; CHECK-NEXT: zip2 v17.4s, v17.4s, v2.4s +; CHECK-NEXT: ext v18.16b, v0.16b, v2.16b, #4 +; CHECK-NEXT: mov v7.s[2], v4.s[3] +; CHECK-NEXT: mov v21.16b, v1.16b +; CHECK-NEXT: mov v16.s[2], v2.s[3] +; CHECK-NEXT: ext v5.16b, v5.16b, v17.16b, #12 +; CHECK-NEXT: zip1 v17.4s, v2.4s, v2.4s +; CHECK-NEXT: ext v3.16b, v3.16b, v6.16b, #12 +; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4 +; CHECK-NEXT: mov v19.16b, v7.16b +; CHECK-NEXT: ext v6.16b, v1.16b, v4.16b, #8 +; CHECK-NEXT: mov v21.s[2], v4.s[1] +; CHECK-NEXT: mov v20.16b, v16.16b +; CHECK-NEXT: mov v19.s[1], v4.s[2] +; CHECK-NEXT: trn2 v0.4s, v17.4s, v0.4s +; CHECK-NEXT: sub v16.4s, v16.4s, v5.4s +; CHECK-NEXT: mov v17.16b, v18.16b +; CHECK-NEXT: ext v1.16b, v6.16b, v1.16b, #4 +; CHECK-NEXT: sub v7.4s, v7.4s, v3.4s +; CHECK-NEXT: mov 
v20.s[1], v2.s[2] +; CHECK-NEXT: mov v17.s[0], v2.s[1] +; CHECK-NEXT: mov v2.16b, v21.16b +; CHECK-NEXT: add v3.4s, v19.4s, v3.4s +; CHECK-NEXT: uzp2 v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v5.4s, v20.4s, v5.4s +; CHECK-NEXT: mov v2.s[1], v4.s[0] +; CHECK-NEXT: sub v4.4s, v0.4s, v18.4s +; CHECK-NEXT: mov v3.d[1], v7.d[1] +; CHECK-NEXT: add v0.4s, v0.4s, v17.4s +; CHECK-NEXT: mov v5.d[1], v16.d[1] +; CHECK-NEXT: sub v6.4s, v21.4s, v1.4s +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: mov v0.d[1], v4.d[1] +; CHECK-NEXT: cmlt v4.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v2.8h, v5.8h, #0 +; CHECK-NEXT: mov v1.d[1], v6.d[1] +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 +; CHECK-NEXT: add v5.4s, v2.4s, v5.4s +; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b ; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 -; CHECK-NEXT: cmlt v2.8h, v0.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v4.8h, #0 -; CHECK-NEXT: cmlt v5.8h, v3.8h, #0 +; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: eor v2.16b, v5.16b, v2.16b ; CHECK-NEXT: add v1.4s, v7.4s, v1.4s -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v4.4s, v6.4s, v4.4s -; CHECK-NEXT: add v3.4s, v5.4s, v3.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v2.16b, v3.16b, v5.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v6.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s @@ -278,13 +278,13 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: mov v1.d[1], v6.d[1] ; CHECK-NEXT: add v2.4s, v2.4s, v16.4s ; CHECK-NEXT: add v3.4s, v4.4s, v17.4s +; CHECK-NEXT: rev64 v5.4s, v2.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: sub v1.4s, v17.4s, v4.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s ; CHECK-NEXT: rev64 v6.4s, v3.4s +; CHECK-NEXT: mov v5.d[1], v2.d[1] ; CHECK-NEXT: sub v4.4s, v1.4s, v0.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v5.d[1], v2.d[1] ; CHECK-NEXT: mov v6.d[1], v3.d[1] ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s ; CHECK-NEXT: add v1.4s, v2.4s, v6.4s @@ -304,40 +304,40 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: mov v4.d[1], v16.d[1] ; CHECK-NEXT: mov v1.d[1], v7.d[1] ; CHECK-NEXT: add v0.4s, v17.4s, v1.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s ; CHECK-NEXT: add v2.4s, v18.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s ; CHECK-NEXT: sub v3.4s, v4.4s, v18.4s -; CHECK-NEXT: zip2 v4.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: ext v6.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v0.4s, v1.4s ; CHECK-NEXT: zip2 v7.4s, v1.4s, v0.4s ; CHECK-NEXT: zip2 v16.4s, v3.4s, v2.4s ; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s ; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: zip1 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v5.16b, v1.16b, #8 -; CHECK-NEXT: ext v18.16b, v6.16b, v3.16b, #8 -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ext v1.16b, v1.16b, v5.16b, #4 -; CHECK-NEXT: ext v5.16b, v18.16b, v6.16b, #4 -; CHECK-NEXT: cmlt v2.8h, v4.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v4.4s, v2.4s, v4.4s -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: 
ext v18.16b, v4.16b, v1.16b, #8 +; CHECK-NEXT: ext v19.16b, v5.16b, v3.16b, #8 +; CHECK-NEXT: zip1 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: add v2.4s, v16.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s +; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4 +; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 +; CHECK-NEXT: add v4.4s, v5.4s, v4.4s ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: add v2.4s, v6.4s, v2.4s +; CHECK-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: cmlt v4.8h, v1.8h, #0 -; CHECK-NEXT: add v2.4s, v3.4s, v2.4s -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: add v3.4s, v7.4s, v4.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: eor v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: eor v2.16b, v3.16b, v7.16b +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 @@ -461,93 +461,93 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: ldr d3, [x11, x9] ; CHECK-NEXT: ldr d4, [x10] ; CHECK-NEXT: ldr d5, [x11] +; CHECK-NEXT: shll2 v6.4s, v0.8h, #16 ; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b -; CHECK-NEXT: usubl v3.8h, v4.8b, v5.8b -; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 -; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h -; CHECK-NEXT: shll2 v4.4s, v2.8h, #16 -; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: shll2 v5.4s, v3.8h, #16 -; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h -; CHECK-NEXT: saddw v3.4s, v5.4s, v3.4h -; CHECK-NEXT: rev64 v4.4s, v0.4s +; CHECK-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b +; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: saddw v1.4s, v3.4s, v1.4h +; CHECK-NEXT: shll2 v3.4s, v4.8h, #16 +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h ; CHECK-NEXT: rev64 v5.4s, v1.4s -; CHECK-NEXT: rev64 v6.4s, v2.4s -; CHECK-NEXT: rev64 v7.4s, v3.4s -; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s +; CHECK-NEXT: saddw v3.4s, v3.4s, v4.4h +; CHECK-NEXT: rev64 v4.4s, v2.4s +; CHECK-NEXT: sub v6.4s, v0.4s, v6.4s ; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s +; CHECK-NEXT: rev64 v7.4s, v3.4s ; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s +; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s ; CHECK-NEXT: addp v2.4s, v2.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v7.4s +; CHECK-NEXT: ext v1.16b, v5.16b, v6.16b, #4 +; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v7.16b, v5.16b, v4.16b, #4 -; CHECK-NEXT: mov v4.s[3], v5.s[2] -; CHECK-NEXT: zip2 v16.4s, v6.4s, v1.4s -; CHECK-NEXT: zip1 v1.4s, v6.4s, v1.4s -; CHECK-NEXT: uzp2 v6.4s, v2.4s, v0.4s -; CHECK-NEXT: ext v5.16b, v7.16b, v5.16b, #4 +; CHECK-NEXT: mov v6.s[3], v5.s[2] +; CHECK-NEXT: zip2 v16.4s, v4.4s, v7.4s +; CHECK-NEXT: zip1 v4.4s, v4.4s, v7.4s +; CHECK-NEXT: ext v1.16b, v1.16b, v5.16b, #4 +; CHECK-NEXT: uzp2 v5.4s, v2.4s, v0.4s ; CHECK-NEXT: uzp1 v0.4s, v2.4s, v0.4s ; CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s ; CHECK-NEXT: uzp2 v2.4s, v2.4s, 
v3.4s -; CHECK-NEXT: mov v16.d[1], v4.d[1] -; CHECK-NEXT: rev64 v3.4s, v6.4s -; CHECK-NEXT: mov v1.d[1], v5.d[1] +; CHECK-NEXT: mov v16.d[1], v6.d[1] +; CHECK-NEXT: mov v4.d[1], v1.d[1] +; CHECK-NEXT: rev64 v1.4s, v5.4s ; CHECK-NEXT: rev64 v0.4s, v0.4s ; CHECK-NEXT: sub v2.4s, v7.4s, v2.4s -; CHECK-NEXT: sub v4.4s, v1.4s, v16.4s -; CHECK-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-NEXT: add v1.4s, v16.4s, v1.4s -; CHECK-NEXT: zip1 v3.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v4.4s, v16.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v16.4s, v4.4s +; CHECK-NEXT: zip1 v4.4s, v2.4s, v3.4s ; CHECK-NEXT: zip1 v5.4s, v0.4s, v1.4s ; CHECK-NEXT: uzp2 v6.4s, v0.4s, v1.4s ; CHECK-NEXT: zip2 v7.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v17.4s, v2.4s, v4.4s -; CHECK-NEXT: ext v16.16b, v2.16b, v3.16b, #8 +; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v4.16b, #8 ; CHECK-NEXT: trn2 v5.4s, v0.4s, v5.4s ; CHECK-NEXT: uzp2 v6.4s, v6.4s, v0.4s -; CHECK-NEXT: mov v2.s[3], v4.s[2] +; CHECK-NEXT: mov v2.s[3], v3.s[2] ; CHECK-NEXT: mov v0.s[1], v1.s[1] ; CHECK-NEXT: mov v5.d[1], v16.d[1] ; CHECK-NEXT: mov v6.d[1], v17.d[1] ; CHECK-NEXT: mov v7.d[1], v2.d[1] -; CHECK-NEXT: mov v0.d[1], v3.d[1] +; CHECK-NEXT: mov v0.d[1], v4.d[1] ; CHECK-NEXT: add v1.4s, v6.4s, v7.4s -; CHECK-NEXT: sub v2.4s, v7.4s, v6.4s -; CHECK-NEXT: add v3.4s, v5.4s, v0.4s +; CHECK-NEXT: add v2.4s, v5.4s, v0.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v6.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: zip2 v4.4s, v1.4s, v2.4s -; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #4 -; CHECK-NEXT: ext v6.16b, v3.16b, v3.16b, #4 -; CHECK-NEXT: zip2 v7.4s, v2.4s, v1.4s -; CHECK-NEXT: zip2 v16.4s, v0.4s, v3.4s -; CHECK-NEXT: zip2 v17.4s, v3.4s, v0.4s -; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ext v2.16b, v5.16b, v2.16b, #8 -; CHECK-NEXT: ext v18.16b, v6.16b, v0.16b, #8 -; CHECK-NEXT: zip1 v0.4s, v3.4s, v0.4s -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s -; CHECK-NEXT: ext v2.16b, v2.16b, v5.16b, #4 -; CHECK-NEXT: ext v5.16b, v18.16b, v6.16b, #4 +; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v7.4s, v3.4s, v1.4s +; CHECK-NEXT: zip2 v16.4s, v0.4s, v2.4s +; CHECK-NEXT: zip2 v17.4s, v2.4s, v0.4s +; CHECK-NEXT: zip1 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ext v18.16b, v4.16b, v3.16b, #8 +; CHECK-NEXT: ext v19.16b, v5.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add v2.4s, v16.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s +; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4 +; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: cmlt v1.8h, v4.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v4.4s, v1.4s, v4.4s -; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 +; CHECK-NEXT: add v4.4s, v5.4s, v4.4s ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: add v2.4s, v6.4s, v2.4s +; CHECK-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add v2.4s, v4.4s, v2.4s +; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: add v3.4s, v7.4s, v4.4s ; 
CHECK-NEXT: eor v0.16b, v0.16b, v5.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: eor v2.16b, v3.16b, v7.16b ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: eor v1.16b, v2.16b, v4.16b -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll index 86c224bee990ad..2deb19be24821b 100644 --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -346,9 +346,9 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_min(<16 x i8> %x) { ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_min: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.16b, #213 +; CHECK-NEXT: movi v2.16b, #42 ; CHECK-NEXT: umin v0.16b, v0.16b, v1.16b -; CHECK-NEXT: movi v1.16b, #42 -; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: add v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %c = icmp ult <16 x i8> %x, %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> @@ -384,9 +384,9 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_min(<8 x i16> %x) { ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_min: ; CHECK: // %bb.0: ; CHECK-NEXT: mvni v1.8h, #42 +; CHECK-NEXT: movi v2.8h, #42 ; CHECK-NEXT: umin v0.8h, v0.8h, v1.8h -; CHECK-NEXT: movi v1.8h, #42 -; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: add v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret %c = icmp ult <8 x i16> %x, %s = select <8 x i1> %c, <8 x i16> %x, <8 x i16> diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 3e0d5dd875097f..5237a3491de9b4 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -245,15 +245,15 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) { ; CHECK-GI-LABEL: sext_v3i8_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #24 // =0x18 -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: mov v1.s[1], w1 -; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: mov v1.s[2], w2 -; CHECK-GI-NEXT: mov v0.s[2], w8 -; CHECK-GI-NEXT: neg v2.4s, v0.4s -; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v0.s[2], w2 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i8> %a to <3 x i32> @@ -408,15 +408,15 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) { ; CHECK-GI-LABEL: sext_v3i10_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #22 // =0x16 -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: mov v1.s[1], w1 -; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: mov v1.s[2], w2 -; CHECK-GI-NEXT: mov v0.s[2], w8 -; CHECK-GI-NEXT: neg v2.4s, v0.4s -; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v0.s[2], w2 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i10> %a to <3 x i32> diff --git a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll 
b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll index f4f75bb9c7825f..88e062d2c999c1 100644 --- a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll +++ b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll @@ -158,8 +158,8 @@ define i32 @sink_sub_from_const_to_sub2(i32 %a, i32 %b) { define <4 x i32> @vec_sink_add_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_add_of_const_to_add0: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -170,8 +170,8 @@ define <4 x i32> @vec_sink_add_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_add_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_add_of_const_to_add1: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -186,8 +186,8 @@ define <4 x i32> @vec_sink_add_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_add0: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -198,8 +198,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_add1: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI15_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -214,8 +214,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_add0: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI16_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -226,8 +226,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_add0(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_add1: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -242,8 +242,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_add1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_add_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_add_of_const_to_sub: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -254,8 +254,8 @@ define <4 x i32> @vec_sink_add_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_add_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: 
vec_sink_add_of_const_to_sub2: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -270,8 +270,8 @@ define <4 x i32> @vec_sink_add_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_sub: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -282,8 +282,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_sub2: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI21_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -298,8 +298,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_sub: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI22_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret @@ -310,8 +310,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_sub2: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI23_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index 0c674c5685e000..1d1bae42c9e300 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -28,8 +28,8 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -61,8 +61,8 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -94,8 +94,8 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; 
CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -127,8 +127,8 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -165,8 +165,8 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -205,8 +205,8 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -245,8 +245,8 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -290,8 +290,8 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -331,8 +331,8 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -372,8 +372,8 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -413,8 +413,8 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -454,8 +454,8 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; 
CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -495,8 +495,8 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -536,8 +536,8 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -581,8 +581,8 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -621,8 +621,8 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -661,8 +661,8 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -701,8 +701,8 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -741,8 +741,8 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -781,8 +781,8 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8bf16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -821,8 +821,8 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, 
[x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -861,8 +861,8 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -894,9 +894,9 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -942,13 +942,13 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i8 @get_i8() @@ -969,13 +969,13 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i16 @get_i16() @@ -996,13 +996,13 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i32 @get_i32() @@ -1023,13 +1023,13 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, 
#16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i64 @get_i64() @@ -1056,11 +1056,11 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call half @get_f16() @@ -1086,11 +1086,11 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call float @get_f32() @@ -1116,11 +1116,11 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call double @get_f64() @@ -1150,11 +1150,11 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i8> @get_v1i8() @@ -1181,11 +1181,11 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: 
add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i16> @get_v1i16() @@ -1212,11 +1212,11 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i32> @get_v1i32() @@ -1243,11 +1243,11 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i64> @get_v1i64() @@ -1275,11 +1275,11 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x half> @get_v1f16() @@ -1306,11 +1306,11 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x float> @get_v1f32() @@ -1337,11 +1337,11 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = 
call <1 x double> @get_v1f64() @@ -1373,11 +1373,11 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <16 x i8> @get_v16i8() @@ -1404,11 +1404,11 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <8 x i16> @get_v8i16() @@ -1435,11 +1435,11 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <4 x i32> @get_v4i32() @@ -1466,11 +1466,11 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <2 x i64> @get_v2i64() @@ -1497,11 +1497,11 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; 
CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <8 x half> @get_v8f16() @@ -1528,11 +1528,11 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <4 x float> @get_v4f32() @@ -1559,11 +1559,11 @@ define void @dont_coalesce_res_v2f64(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <2 x double> @get_v2f64() diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 47b24290d3c85f..1e16f140676bac 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -151,8 +151,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB4_4: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll index 0097968b1171d7..b4fd5a2272e7ea 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll @@ -26,16 +26,16 @@ define void @fdot_multi_za32_f16_vg1x2(i32 %slice, %unused, < define void @fdot_multi_za32_f16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: fdot_multi_za32_f16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: fdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -71,16 +71,16 @@ define void @bfdot_multi_za32_bf16_vg1x2(i32 %slice, %unused, define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: fdot_multi_za32_bf16_vg1x4: ; 
CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: bfdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: bfdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index 6d986048371157..e154a4df86efe1 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -26,16 +26,16 @@ define void @udot_multi_za32_u16_vg1x2(i32 %slice, %unused, < define void @udot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -68,16 +68,16 @@ define void @udot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -110,16 +110,16 @@ define void @udot_multi_za64_u16_vg1x2(i32 %slice, %unused, < define void @udot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -152,16 +152,16 @@ define void @usdot_multi_za32_u8_vg1x2(i32 %slice, %unused, < define void @usdot_multi_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: usdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; 
CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -197,16 +197,16 @@ define void @sdot_multi_za32_u16_vg1x2(i32 %slice, %unused, < define void @sdot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -239,16 +239,16 @@ define void @sdot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -281,16 +281,16 @@ define void @sdot_multi_za64_u16_vg1x2(i32 %slice, %unused, < define void @sdot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll index e95d29f65e55e7..92e8877927ea57 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SMAX (Single, x2) @@ -151,8 +152,7 @@ define { , } @multi_vec_max_single_x2 ; SMAX (Single, x4) -define { , , , } -@multi_vec_max_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_s8: ; CHECK: // %bb.0: ; 
CHECK-NEXT: mov z27.d, z4.d @@ -170,8 +170,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -189,8 +188,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -208,8 +206,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -229,8 +226,7 @@ define { , , , , , , } -@multi_vec_max_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -248,8 +244,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -267,8 +262,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -286,8 +280,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -307,8 +300,7 @@ define { , , , , , , } -@multi_vec_max_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -326,8 +318,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -345,8 +336,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -537,104 +527,100 @@ define { , } @multi_vec_max_multi_x2_ ; SMAX (Multi, x4) -define { , , , } -@multi_vec_max_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: 
multi_vec_max_multi_x4_s8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_s16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_s32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_s64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d 
-; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -643,104 +629,100 @@ define { , , , , , , } -@multi_vec_max_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; 
CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -749,78 +731,75 @@ define { , , , , , , } -@multi_vec_max_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_f16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmax.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_f32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; 
CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmax.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_f64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmax.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -870,8 +849,7 @@ define { , } @multi_vec_maxnm_single ; FMAXNM (Single, x4) -define { , , , } -@multi_vec_maxnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_maxnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_maxnm_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -889,8 +867,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_maxnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_maxnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_maxnm_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -908,8 +885,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_maxnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_maxnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_maxnm_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -976,19 +952,18 @@ define { , } @multi_vec_maxnm_x2_f64( ; FMAXNM (Multi, x4) -define { , , , } -@multi_vec_maxnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_maxnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_maxnm_x4_f16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmaxnm { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov 
z1.d, z25.d @@ -996,24 +971,23 @@ define { , , , , , , } - @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, + @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_maxnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_maxnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_maxnm_x4_f32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmaxnm { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -1026,19 +1000,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_maxnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_maxnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_maxnm_x4_f64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmaxnm { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll index 21a55c6436acd8..363f9ba5d3530e 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SMIN (Single, x2) @@ -151,8 +152,7 @@ define { , } @multi_vec_min_single_x2 ; SMIN (Single, x4) -define { , , , } -@multi_vec_min_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_s8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -170,8 +170,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -189,8 +188,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: 
multi_vec_min_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -208,8 +206,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -229,8 +226,7 @@ define { , , , , , , } -@multi_vec_min_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -248,8 +244,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -267,8 +262,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -286,8 +280,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -307,8 +300,7 @@ define { , , , , , , } -@multi_vec_min_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -326,8 +318,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -345,8 +336,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -537,104 +527,100 @@ define { , } @multi_vec_min_multi_x2_ ; SMIN (Multi, x4) -define { , , , } -@multi_vec_min_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, 
%zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -643,104 +629,100 @@ define { , , , , , , } -@multi_vec_min_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, 
z1.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -749,78 +731,75 @@ define { , , , , , , } -@multi_vec_min_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; 
CHECK-LABEL: multi_vec_min_multi_x4_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmin.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmin.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmin.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -870,8 +849,7 @@ define { , } @multi_vec_minnm_single ; FMINNM (Single, x4) -define { , , , } -@multi_vec_minnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_minnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_minnm_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -889,8 +867,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_minnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_minnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_minnm_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -908,8 +885,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_minnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } 
@multi_vec_minnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_minnm_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -976,19 +952,18 @@ define { , } @multi_vec_minnm_x2_f64( ; FMINNM (Multi, x4) -define { , , , } -@multi_vec_minnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_minnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_minnm_x4_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fminnm { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -1001,19 +976,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_minnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_minnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_minnm_x4_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fminnm { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -1026,19 +1000,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_minnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_minnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_minnm_x4_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fminnm { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll index f766bfcff4d1d1..346afc611eb756 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll @@ -142,16 +142,16 @@ define void @multi_vector_mul_add_multi_long_vg4x2_s16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: smlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; 
CHECK-NEXT: smlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -164,16 +164,16 @@ define void @multi_vector_mul_add_multi_long_vg4x4_s8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: smlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: smlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -418,16 +418,16 @@ define void @multi_vector_mul_add_multi_long_vg4x2_u16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: umlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -440,16 +440,16 @@ define void @multi_vector_mul_add_multi_long_vg4x4_u8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: umlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -694,16 +694,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x2_s16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -716,16 +716,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x4_s8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: 
mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -970,16 +970,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x2_u16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -992,16 +992,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x4_u8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -1275,16 +1275,16 @@ define void @multi_vector_mul_add_multi_unsigned_long_vg4x2_u8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: usmlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: usmlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll index d138a3af438524..12a940ff03e29a 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SRSHL (Single, x2) @@ -56,8 +57,7 @@ define { , } @multi_vec_rounding_shl_single ; SRSHL (Single, x4) -define { , , , } -@multi_vec_rounding_shl_single_x4_s8( %dummy, %zdn1, 
%zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -75,8 +75,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -94,8 +93,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -113,8 +111,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -188,8 +185,7 @@ define { , } @multi_vec_rounding_shl_single ; URSHL (Single, x4) -define { , , , } -@multi_vec_rounding_shl_single_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -207,8 +203,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -226,8 +221,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -245,8 +239,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -328,19 +321,18 @@ define { , } @multi_vec_rounding_shl_x2_s64 ; SRSHL (Multi, x4) -define { , , , } -@multi_vec_rounding_shl_x4_s8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b 
}, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -353,19 +345,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -378,19 +369,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -403,19 +393,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -492,19 +481,18 @@ define { , } @multi_vec_rounding_uhl_x2_u64 ; URSHL (Multi, x4) -define { , , , } -@multi_vec_rounding_shl_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_u8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; 
CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -517,19 +505,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_u16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -542,19 +529,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_u32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -567,19 +553,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_u64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d diff --git 
a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll index 9c5dff6c3bf6fb..e71afe213d8a59 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SQDMULH (Single, x2) @@ -56,8 +57,7 @@ define { , } @multi_vec_sat_double_mulh_sin ; SQDMULH (Single, x4) -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -75,8 +75,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -94,8 +93,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -113,8 +111,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -196,104 +193,100 @@ define { , } @multi_vec_sat_double_mulh_mul ; SQDMULH (x4, Multi) -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s16: ; CHECK: // %bb.0: +; 
CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) diff --git a/llvm/test/CodeGen/AArch64/split-vector-insert.ll b/llvm/test/CodeGen/AArch64/split-vector-insert.ll index a507296338f939..b8ab9a00c69810 100644 --- a/llvm/test/CodeGen/AArch64/split-vector-insert.ll +++ b/llvm/test/CodeGen/AArch64/split-vector-insert.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -debug-only=legalize-types 2>&1 | FileCheck %s --check-prefix=CHECK-LEGALIZATION
 ; RUN: llc < %s | FileCheck %s
 ; REQUIRES: asserts
@@ -9,54 +10,94 @@ declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64>
 declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v8f64(<vscale x 2 x double>, <8 x double>, i64)
 define <vscale x 2 x i64> @test_nxv2i64_v8i64(<vscale x 2 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LEGALIZATION: Legally typed node: [[T1:t[0-9]+]]: nxv2i64 = insert_subvector {{t[0-9]+}}, {{t[0-9]+}}, Constant:i64<0>
-; CHECK-LEGALIZATION: Legally typed node: [[T2:t[0-9]+]]: nxv2i64 = insert_subvector [[T1]], {{t[0-9]+}}, Constant:i64<2>
-; CHECK-LEGALIZATION: Legally typed node: [[T3:t[0-9]+]]: nxv2i64 = insert_subvector [[T2]], {{t[0-9]+}}, Constant:i64<4>
-; CHECK-LEGALIZATION: Legally typed node: [[T4:t[0-9]+]]: nxv2i64 = insert_subvector [[T3]], {{t[0-9]+}}, Constant:i64<6>
-
+; CHECK-LEGALIZATION-LABEL: test_nxv2i64_v8i64:
+; CHECK-LEGALIZATION: // %bb.0:
+; CHECK-LEGALIZATION-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 16
+; CHECK-LEGALIZATION-NEXT: .cfi_offset w29, -16
+; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #-3
+; CHECK-LEGALIZATION-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-LEGALIZATION-NEXT: cntd x8
+; CHECK-LEGALIZATION-NEXT: ptrue p0.d, vl2
+; CHECK-LEGALIZATION-NEXT: mov w9, #2 // =0x2
+; CHECK-LEGALIZATION-NEXT: sub x8, x8, #2
+; CHECK-LEGALIZATION-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-LEGALIZATION-NEXT: mov x10, sp
+; CHECK-LEGALIZATION-NEXT: cmp x8, #2
+; CHECK-LEGALIZATION-NEXT: mov z0.d, p0/m, z1.d
+; CHECK-LEGALIZATION-NEXT: ptrue p0.d
+; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo
+; CHECK-LEGALIZATION-NEXT: cmp x8, #4
+; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3
+; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-LEGALIZATION-NEXT: str q2, [x10, x9]
+; CHECK-LEGALIZATION-NEXT: mov w9, #4 // =0x4
+; CHECK-LEGALIZATION-NEXT: addvl x10, sp, #1
+; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo
+; CHECK-LEGALIZATION-NEXT: cmp x8, #6
+; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3
+; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-LEGALIZATION-NEXT: str q3, [x10, x9]
+; CHECK-LEGALIZATION-NEXT: mov w9, #6 // =0x6
+; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-LEGALIZATION-NEXT: csel x8, x8, x9, lo
+; CHECK-LEGALIZATION-NEXT: addvl x9, sp, #2
+; CHECK-LEGALIZATION-NEXT: lsl x8, x8, #3
+; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl]
+; CHECK-LEGALIZATION-NEXT: str q4, [x9, x8]
+; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl]
+; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #3
+; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-LEGALIZATION-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 0
+; CHECK-LEGALIZATION-NEXT: .cfi_restore w29
+; CHECK-LEGALIZATION-NEXT: ret
+;
 ; CHECK-LABEL: test_nxv2i64_v8i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG -; CHECK-NEXT: ptrue p1.d, vl2 -; CHECK-NEXT: cntd x8 -; CHECK-NEXT: mov w9, #2 // =0x2 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub x8, x8, #2 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: mov z0.d, p1/m, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: str q2, [x10, x9] -; CHECK-NEXT: mov w9, #4 // =0x4 -; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #6 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: str q3, [x10, x9] -; CHECK-NEXT: mov w9, #6 // =0x6 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: addvl x9, sp, #2 -; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: str q4, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: addvl sp, sp, #3 -; CHECK-NEXT: .cfi_def_cfa wsp, 16 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: ret +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov w9, #2 // =0x2 +; CHECK-NEXT: sub x8, x8, #2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q2, [x10, x9] +; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: addvl x10, sp, #1 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #6 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str q3, [x10, x9] +; CHECK-NEXT: mov w9, #6 // =0x6 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: addvl x9, sp, #2 +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str q4, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + @@ -66,54 +107,94 @@ define @test_nxv2i64_v8i64( %a, <8 x i64> % } define @test_nxv2f64_v8f64( %a, <8 x double> %b) #0 { -; CHECK-LEGALIZATION: Legally typed node: [[T1:t[0-9]+]]: nxv2f64 = insert_subvector {{t[0-9]+}}, {{t[0-9]+}}, Constant:i64<0> -; CHECK-LEGALIZATION: Legally typed node: [[T2:t[0-9]+]]: nxv2f64 = insert_subvector [[T1]], {{t[0-9]+}}, Constant:i64<2> -; CHECK-LEGALIZATION: Legally typed node: [[T3:t[0-9]+]]: nxv2f64 = insert_subvector [[T2]], 
{{t[0-9]+}}, Constant:i64<4> -; CHECK-LEGALIZATION: Legally typed node: [[T4:t[0-9]+]]: nxv2f64 = insert_subvector [[T3]], {{t[0-9]+}}, Constant:i64<6> - +; CHECK-LEGALIZATION-LABEL: test_nxv2f64_v8f64: +; CHECK-LEGALIZATION: // %bb.0: +; CHECK-LEGALIZATION-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 16 +; CHECK-LEGALIZATION-NEXT: .cfi_offset w29, -16 +; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #-3 +; CHECK-LEGALIZATION-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-LEGALIZATION-NEXT: cntd x8 +; CHECK-LEGALIZATION-NEXT: ptrue p0.d, vl2 +; CHECK-LEGALIZATION-NEXT: mov w9, #2 // =0x2 +; CHECK-LEGALIZATION-NEXT: sub x8, x8, #2 +; CHECK-LEGALIZATION-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-LEGALIZATION-NEXT: mov x10, sp +; CHECK-LEGALIZATION-NEXT: cmp x8, #2 +; CHECK-LEGALIZATION-NEXT: mov z0.d, p0/m, z1.d +; CHECK-LEGALIZATION-NEXT: ptrue p0.d +; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: cmp x8, #4 +; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-LEGALIZATION-NEXT: str q2, [x10, x9] +; CHECK-LEGALIZATION-NEXT: mov w9, #4 // =0x4 +; CHECK-LEGALIZATION-NEXT: addvl x10, sp, #1 +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: cmp x8, #6 +; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: str q3, [x10, x9] +; CHECK-LEGALIZATION-NEXT: mov w9, #6 // =0x6 +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: csel x8, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: addvl x9, sp, #2 +; CHECK-LEGALIZATION-NEXT: lsl x8, x8, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: str q4, [x9, x8] +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #3 +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-LEGALIZATION-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 0 +; CHECK-LEGALIZATION-NEXT: .cfi_restore w29 +; CHECK-LEGALIZATION-NEXT: ret +; ; CHECK-LABEL: test_nxv2f64_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG -; CHECK-NEXT: ptrue p1.d, vl2 -; CHECK-NEXT: cntd x8 -; CHECK-NEXT: mov w9, #2 // =0x2 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub x8, x8, #2 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: mov z0.d, p1/m, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: str q2, [x10, x9] -; CHECK-NEXT: mov w9, #4 // =0x4 -; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #6 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: str q3, [x10, x9] -; CHECK-NEXT: mov w9, #6 // =0x6 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: addvl x9, sp, #2 -; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: str q4, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: addvl sp, sp, #3 -; CHECK-NEXT: .cfi_def_cfa wsp, 16 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: ret +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov w9, #2 // =0x2 +; CHECK-NEXT: sub x8, x8, #2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q2, [x10, x9] +; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: addvl x10, sp, #1 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #6 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str q3, [x10, x9] +; CHECK-NEXT: mov w9, #6 // =0x6 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: addvl x9, sp, #2 +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str q4, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll index 1d9cb88260b609..c0c0ae5c9d1fe9 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -111,6 +111,7 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_undef1: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: 
movi v3.4s, #25
 ; CHECK-NEXT: movk w8, #20971, lsl #16
 ; CHECK-NEXT: dup v1.4s, w8
 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
@@ -118,9 +119,8 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind {
 ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT: sshr v2.4s, v1.4s, #3
 ; CHECK-NEXT: usra v2.4s, v1.4s, #31
-; CHECK-NEXT: movi v1.4s, #25
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
 ; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -134,6 +134,7 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_undef1:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #34079 // =0x851f
+; CHECK-NEXT: movi v3.4s, #100
 ; CHECK-NEXT: movk w8, #20971, lsl #16
 ; CHECK-NEXT: dup v1.4s, w8
 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
@@ -141,9 +142,8 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind {
 ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT: sshr v2.4s, v1.4s, #5
 ; CHECK-NEXT: usra v2.4s, v1.4s, #31
-; CHECK-NEXT: movi v1.4s, #100
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
 ; CHECK-NEXT: movi v1.4s, #1
+; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -201,11 +201,11 @@ define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind {
 define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_int_min:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cmlt v1.4s, v0.4s, #0
-; CHECK-NEXT: mov v2.16b, v0.16b
-; CHECK-NEXT: usra v2.4s, v1.4s, #1
+; CHECK-NEXT: cmlt v2.4s, v0.4s, #0
+; CHECK-NEXT: mov v3.16b, v0.16b
 ; CHECK-NEXT: movi v1.4s, #128, lsl #24
-; CHECK-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: usra v3.4s, v2.4s, #1
+; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
index 0598af7c980635..a74f0c86fe1859 100644
--- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
@@ -245,6 +245,7 @@ define <4 x i32> @fold_srem_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: fold_srem_v4i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #26215 // =0x6667
+; CHECK-NEXT: movi v3.4s, #10
 ; CHECK-NEXT: movk w8, #26214, lsl #16
 ; CHECK-NEXT: dup v1.4s, w8
 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s
@@ -252,8 +253,7 @@ define <4 x i32> @fold_srem_v4i32(<4 x i32> %x) {
 ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT: sshr v2.4s, v1.4s, #2
 ; CHECK-NEXT: usra v2.4s, v1.4s, #31
-; CHECK-NEXT: movi v1.4s, #10
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
+; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s
 ; CHECK-NEXT: ret
 %1 = srem <4 x i32> %x, <i32 10, i32 10, i32 10, i32 10>
 ret <4 x i32> %1
diff --git a/llvm/test/CodeGen/AArch64/sve-abd.ll b/llvm/test/CodeGen/AArch64/sve-abd.ll
index 95aec0a4926199..7b492229e3d23d 100644
--- a/llvm/test/CodeGen/AArch64/sve-abd.ll
+++ b/llvm/test/CodeGen/AArch64/sve-abd.ll
@@ -24,10 +24,10 @@ define <vscale x 16 x i8> @sabd_b(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
 define <vscale x 16 x i8> @sabd_b_promoted_ops(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
 ; CHECK-LABEL: sabd_b_promoted_ops:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p2.b
 ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: sabd z0.b, p2/m, z0.b, z1.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: sabd z0.b, p0/m, z0.b, z1.b
 ; CHECK-NEXT: ret
 %a.sext = sext <vscale x 16 x i1> %a to <vscale x 16 x i8>
 %b.sext = sext <vscale x 16 x i1> %b to <vscale x 16 x i8>
@@ -144,10 +144,10 @@ define @uabd_b( 
%a, %b) define @uabd_b_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_b_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 ; CHECK-NEXT: mov z1.b, p1/z, #1 // =0x1 -; CHECK-NEXT: uabd z0.b, p2/m, z0.b, z1.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: uabd z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to @@ -173,9 +173,9 @@ define @uabd_h( %a, %b) define @uabd_h_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_h_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: uabd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %a.zext = zext %a to @@ -202,9 +202,9 @@ define @uabd_s( %a, %b) define @uabd_s_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_s_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %a.zext = zext %a to @@ -231,9 +231,9 @@ define @uabd_d( %a, %b) define @uabd_d_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_d_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uabd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %a.zext = zext %a to @@ -248,8 +248,8 @@ define @uabd_d_promoted_ops( %a, @uabd_non_matching_extension( %a, %b) #0 { ; CHECK-LABEL: uabd_non_matching_extension: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %a.zext = zext %a to @@ -265,9 +265,9 @@ define @uabd_non_matching_extension( %a, @uabd_non_matching_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_non_matching_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %a.zext = zext %a to diff --git a/llvm/test/CodeGen/AArch64/sve-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-bitcast.ll index 7dd568fc837a3b..95f43ba5126323 100644 --- a/llvm/test/CodeGen/AArch64/sve-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-bitcast.ll @@ -1698,8 +1698,8 @@ define @bitcast_nxv8i8_to_nxv1i64( %v) #0 { ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -1720,8 +1720,8 @@ define @bitcast_nxv4i16_to_nxv1i64( %v) #0 ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -1742,8 +1742,8 @@ define @bitcast_nxv2i32_to_nxv1i64( %v) #0 ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2218,8 +2218,8 @@ define @bitcast_nxv8i8_to_nxv1f64( %v) #0 ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2240,8 +2240,8 @@ define @bitcast_nxv4i16_to_nxv1f64( %v) ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2262,8 +2262,8 @@ define @bitcast_nxv2i32_to_nxv1f64( %v) ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2827,11 +2827,11 @@ define @bitcast_nxv2f16_to_nxv1i32( %v) #0 ; CHECK_BE-NEXT: addvl sp, sp, #-2 ; CHECK_BE-NEXT: ptrue p0.d ; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: ptrue p2.s ; CHECK_BE-NEXT: st1h { z0.d }, p0, [sp] +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] ; CHECK_BE-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1w { z0.s }, p2/z, [sp, #1, mul vl] +; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp, #1, mul vl] ; CHECK_BE-NEXT: addvl sp, sp, #2 ; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret @@ -2860,11 +2860,11 @@ define @bitcast_nxv2bf16_to_nxv1i32( %v) ; CHECK_BE-NEXT: addvl sp, sp, #-2 ; CHECK_BE-NEXT: ptrue p0.d ; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: ptrue p2.s ; CHECK_BE-NEXT: st1h { z0.d }, p0, [sp] +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] ; CHECK_BE-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1w { z0.s }, p2/z, [sp, #1, mul vl] +; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp, #1, mul vl] ; CHECK_BE-NEXT: addvl sp, sp, #2 ; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll index 56b023086ea244..3b7b03e6ef61fe 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -64,12 +64,12 @@ define float @foo2(ptr %x0, ptr %x1) nounwind { ; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w2, #2 // =0x2 ; CHECK-NEXT: mov w3, #3 // =0x3 +; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: mov w4, #4 // =0x4 ; CHECK-NEXT: mov w5, #5 // =0x5 ; CHECK-NEXT: mov w6, #6 // =0x6 ; CHECK-NEXT: mov w7, #7 // =0x7 -; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] -; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w1, #1 // =0x1 @@ -182,8 +182,8 @@ entry: define double @foo5(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, ptr 
%ptr1, ptr %ptr2, double %x0, %x1, %x2) nounwind { ; CHECK-LABEL: foo5: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr x8, [sp] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8, #1, mul vl] ; CHECK-NEXT: ld1d { z6.d }, p0/z, [x8] ; CHECK-NEXT: ld1d { z7.d }, p0/z, [x8, #3, mul vl] @@ -229,10 +229,10 @@ entry: define void @aavpcs1(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 %s6, %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x7] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] ; CHECK-NEXT: st1w { z0.s }, p0, [x9] ; CHECK-NEXT: st1w { z1.s }, p0, [x9] ; CHECK-NEXT: st1w { z2.s }, p0, [x9] @@ -261,12 +261,12 @@ entry: define void @aavpcs2(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16,ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x6] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x5] ; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x4] @@ -299,11 +299,10 @@ entry: define void @aavpcs3(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, %p0, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ldr x9, [sp, #16] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] @@ -311,15 +310,16 @@ define void @aavpcs3(float %s0, float %s1, float %s2, float %s3, float %s4, floa ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] ; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x3] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z24.s }, p0, [x9] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: st1w { z1.s }, p0, [x8] +; CHECK-NEXT: st1w { z3.s }, p0, [x8] +; CHECK-NEXT: st1w { z6.s }, p0, [x8] +; CHECK-NEXT: st1w { z24.s }, p0, [x8] +; CHECK-NEXT: st1w { z7.s }, p0, [x8] +; CHECK-NEXT: st1w { z5.s }, p0, [x8] +; CHECK-NEXT: st1w { z4.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -339,8 +339,8 @@ entry: define void @aavpcs4(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 %s6, i32 %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] +; 
CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x9, [sp, #16] ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x8] ; CHECK-NEXT: st1w { z0.s }, p0, [x9] @@ -371,11 +371,10 @@ entry: define @aavpcs5(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs5: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ldr x9, [sp, #16] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] @@ -383,15 +382,16 @@ define @aavpcs5(float %s0, float %s1, float %s2, float %s3, ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] ; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x3] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z24.s }, p0, [x9] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z3.s }, p0, [x8] +; CHECK-NEXT: st1w { z6.s }, p0, [x8] +; CHECK-NEXT: st1w { z24.s }, p0, [x8] +; CHECK-NEXT: st1w { z7.s }, p0, [x8] +; CHECK-NEXT: st1w { z5.s }, p0, [x8] +; CHECK-NEXT: st1w { z4.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z1.s }, p0, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -409,11 +409,10 @@ entry: define void @aapcs1(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, ptr %ptr) nounwind { ; CHECK-LABEL: aapcs1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ldr x9, [sp, #16] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] @@ -421,15 +420,16 @@ define void @aapcs1(float %s0, float %s1, float %s2, float %s3, float %s4, float ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] ; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] ; CHECK-NEXT: ld1w { z16.s }, p0/z, [x3] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z16.s }, p0, [x9] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: st1w { z1.s }, p0, [x8] +; CHECK-NEXT: st1w { z3.s }, p0, [x8] +; CHECK-NEXT: st1w { z6.s }, p0, [x8] +; CHECK-NEXT: st1w { z16.s }, p0, [x8] +; CHECK-NEXT: st1w { z7.s }, p0, [x8] +; CHECK-NEXT: st1w { z5.s }, p0, [x8] +; CHECK-NEXT: st1w { z4.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -486,13 +486,13 @@ define void @non_sve_caller_high_range_non_sve_callee_high_range(float %f0, floa ; CHECK-NEXT: fmov s2, 
#2.00000000 ; CHECK-NEXT: fmov s3, #3.00000000 ; CHECK-NEXT: fmov s4, #4.00000000 -; CHECK-NEXT: fmov s5, #5.00000000 -; CHECK-NEXT: fmov s6, #6.00000000 -; CHECK-NEXT: fmov s7, #7.00000000 ; CHECK-NEXT: ld1w { z16.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z17.s }, p0/z, [x1] ; CHECK-NEXT: addvl x0, sp, #1 +; CHECK-NEXT: fmov s5, #5.00000000 +; CHECK-NEXT: fmov s6, #6.00000000 ; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: fmov s7, #7.00000000 ; CHECK-NEXT: st1w { z17.s }, p0, [sp] ; CHECK-NEXT: st1w { z16.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: bl non_sve_callee_high_range @@ -548,20 +548,20 @@ define @sve_caller_non_sve_callee_high_range( %a, %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff -; CHECK-NEXT: whilels p1.s, xzr, x8 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: lastb w8, p1, z0.s +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb w8, p0, z0.s ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %vcond = fcmp oeq %a, %b diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll index f5721cd0fd7936..7bc31d44bb6547 100644 --- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll +++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll @@ -87,8 +87,8 @@ define float @fmaximum_f32( %a, %b) { define i32 @add_i32( %a, %b) { ; CHECK-LABEL: add_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add z0.s, z0.s, z2.s ; CHECK-NEXT: uaddv d0, p0, z0.s ; CHECK-NEXT: fmov x0, d0 @@ -160,8 +160,8 @@ define i16 @add_ext_v32i16( %a, %b) { define i32 @and_i32( %a, %b) { ; CHECK-LABEL: and_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -175,8 +175,8 @@ define i32 @and_i32( %a, %b) { define i32 @or_i32( %a, %b) { ; CHECK-LABEL: or_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: orr z0.d, z0.d, z2.d ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -190,8 +190,8 @@ define i32 @or_i32( %a, %b) { define i32 @xor_i32( %a, %b) { ; CHECK-LABEL: xor_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eor3 z0.d, z0.d, z1.d, z2.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll index fe5cdc93877283..180c64e0a7de14 100644 --- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll @@ -10,8 +10,8 @@ define @sdiv_i8( %a) #0 { ; CHECK-LABEL: sdiv_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, #86 // =0x56 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: lsr z1.b, z0.b, #7 ; CHECK-NEXT: add z0.b, z0.b, z1.b @@ -23,8 +23,8 @@ define @sdiv_i8( %a) #0 { define @sdiv_i16( %a) #0 { ; CHECK-LABEL: sdiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #21846 // =0x5556 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: lsr z1.h, z0.h, #15 @@ -37,8 +37,8 @@ define @sdiv_i16( %a) #0 { define @sdiv_i32( %a) #0 { ; CHECK-LABEL: sdiv_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #21846 // =0x5556 +; 
CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w8, #21845, lsl #16 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s @@ -52,8 +52,8 @@ define @sdiv_i32( %a) #0 { define @sdiv_i64( %a) #0 { ; CHECK-LABEL: sdiv_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movk x8, #21846 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d @@ -71,8 +71,8 @@ define @sdiv_i64( %a) #0 { define @udiv_i8( %a) #0 { ; CHECK-LABEL: udiv_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, #-85 // =0xffffffffffffffab +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: lsr z0.b, z0.b, #1 ; CHECK-NEXT: ret @@ -83,8 +83,8 @@ define @udiv_i8( %a) #0 { define @udiv_i16( %a) #0 { ; CHECK-LABEL: udiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #-21845 // =0xffffaaab +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: lsr z0.h, z0.h, #1 @@ -96,8 +96,8 @@ define @udiv_i16( %a) #0 { define @udiv_i32( %a) #0 { ; CHECK-LABEL: udiv_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s @@ -110,8 +110,8 @@ define @udiv_i32( %a) #0 { define @udiv_i64( %a) #0 { ; CHECK-LABEL: udiv_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movk x8, #43691 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll index a3c34b53baa079..6d4f5963881e58 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll @@ -616,8 +616,8 @@ define i1 @test_last_8xi1( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 -; CHECK-NEXT: whilels p1.h, xzr, x8 -; CHECK-NEXT: lastb w8, p1, z0.h +; CHECK-NEXT: whilels p0.h, xzr, x8 +; CHECK-NEXT: lastb w8, p0, z0.h ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %vscale = call i64 @llvm.vscale.i64() @@ -630,10 +630,10 @@ define i1 @test_last_8xi1( %a) #0 { define i1 @test_lanex_4xi1( %a, i32 %x) #0 { ; CHECK-LABEL: test_lanex_4xi1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: whilels p1.s, xzr, x8 -; CHECK-NEXT: lastb w8, p1, z0.s +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb w8, p0, z0.s ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %b = extractelement %a, i32 %x diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll index bc1c563810f358..b9c531fe335261 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll @@ -100,16 +100,16 @@ define <2 x i64> @extract_v2i64_nxv8i64_8( %arg) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: 
mov w9, #8 // =0x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: cmp x8, #8 +; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] @@ -183,10 +183,10 @@ define <4 x i1> @extract_v4i1_nxv32i1_16( %arg) { ; CHECK-NEXT: addvl sp, sp, #-8 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: st1b { z0.b }, p2, [sp, #1, mul vl] ; CHECK-NEXT: st1b { z1.b }, p2, [sp] diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index e2f8dad03ef6f3..88268104889fde 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -17,15 +17,15 @@ define <2 x i64> @extract_v2i64_nxv2i64_idx2( %vec) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x8 ; CHECK-NEXT: mov w9, #2 // =0x2 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -50,15 +50,15 @@ define <4 x i32> @extract_v4i32_nxv4i32_idx4( %vec) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cntw x8 ; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sub x8, x8, #4 ; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #2 -; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -115,15 +115,15 @@ define <8 x i16> @extract_v8i16_nxv8i16_idx8( %vec) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #8 // =0x8 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: sub x8, x8, #8 ; CHECK-NEXT: cmp x8, #8 +; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #1 -; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -214,14 +214,14 @@ define <16 x i8> @extract_v16i8_nxv16i8_idx16( %vec) nounwind ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: sub x8, x8, #16 ; CHECK-NEXT: cmp x8, #16 +; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll index e60a2f142922fd..3c0bd501f45d8b 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll @@ -65,27 +65,25 @@ define @extract_nxv14i1_nxv28i1_14( %in) uw ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: punpkhi p2.h, p1.b -; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: punpklo p1.h, p1.b ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: punpklo p2.h, p2.b ; CHECK-NEXT: punpkhi p3.h, p1.b -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpkhi p4.h, p2.b ; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: uzp1 p4.s, p4.s, p0.s +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpkhi p5.h, p3.b -; CHECK-NEXT: punpklo p3.h, p3.b -; CHECK-NEXT: punpkhi p6.h, p1.b ; CHECK-NEXT: punpklo p1.h, p1.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uzp1 p2.s, p5.s, p2.s +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: punpkhi p5.h, p1.b +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p3.s, p5.s, p3.s ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p3.s, p6.s, p3.s -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p4.s, p4.s, p0.s ; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s ; CHECK-NEXT: uzp1 p1.h, p2.h, p4.h ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-fcmp.ll b/llvm/test/CodeGen/AArch64/sve-fcmp.ll index f7e3b6d0171ac3..35cbe65c6a8b86 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcmp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcmp.ll @@ -374,8 +374,8 @@ define @one_zero( %x) { define @ueq_zero( %x) { ; CHECK-LABEL: ueq_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b diff --git a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll index f15807597ac217..78843e392e5367 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll @@ -63,10 +63,10 @@ define @test_copysign_v4f32_v4f64( %a, ; CHECK-EXTEND-ROUND-NEXT: ptrue p0.d ; CHECK-EXTEND-ROUND-NEXT: uunpkhi z3.d, z0.s ; CHECK-EXTEND-ROUND-NEXT: uunpklo z0.d, z0.s -; CHECK-EXTEND-ROUND-NEXT: and z3.s, z3.s, #0x7fffffff -; CHECK-EXTEND-ROUND-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-EXTEND-ROUND-NEXT: fcvt z2.s, p0/m, z2.d ; CHECK-EXTEND-ROUND-NEXT: fcvt z1.s, p0/m, z1.d +; CHECK-EXTEND-ROUND-NEXT: 
and z3.s, z3.s, #0x7fffffff +; CHECK-EXTEND-ROUND-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-EXTEND-ROUND-NEXT: and z2.s, z2.s, #0x80000000 ; CHECK-EXTEND-ROUND-NEXT: and z1.s, z1.s, #0x80000000 ; CHECK-EXTEND-ROUND-NEXT: orr z2.d, z3.d, z2.d @@ -115,9 +115,9 @@ declare @llvm.copysign.v2f64( %a, @test_copysign_v4f64_v4f32( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z3.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff ; CHECK-NEXT: fcvt z3.d, p0/m, z3.s @@ -193,10 +193,10 @@ define @test_copysign_v4f16_v4f64( %a, @test_copysign_v8f16_v8f32( %a, @test_copysign_v8f16_v8f32( %a, @test_copysign_nxv4f32_nxv4f16( %a, %b) #0 { ; CHECK-NO-EXTEND-ROUND-LABEL: test_copysign_nxv4f32_nxv4f16: ; CHECK-NO-EXTEND-ROUND: // %bb.0: -; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.s ; CHECK-NO-EXTEND-ROUND-NEXT: and z1.s, z1.s, #0x80000000 ; CHECK-NO-EXTEND-ROUND-NEXT: and z0.s, z0.s, #0x7fffffff +; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.s ; CHECK-NO-EXTEND-ROUND-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NO-EXTEND-ROUND-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NO-EXTEND-ROUND-NEXT: ret @@ -285,9 +285,9 @@ define @test_copysign_nxv4f32_nxv4f16( % define @test_copysign_nxv2f64_nxv2f32( %a, %b) #0 { ; CHECK-NO-EXTEND-ROUND-LABEL: test_copysign_nxv2f64_nxv2f32: ; CHECK-NO-EXTEND-ROUND: // %bb.0: -; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.d ; CHECK-NO-EXTEND-ROUND-NEXT: and z1.d, z1.d, #0x8000000000000000 ; CHECK-NO-EXTEND-ROUND-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.d ; CHECK-NO-EXTEND-ROUND-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NO-EXTEND-ROUND-NEXT: fcvt z0.s, p0/m, z0.d ; CHECK-NO-EXTEND-ROUND-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-fcvt.ll index 0fe38bf9ae718a..fc5128fffad36a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcvt.ll @@ -454,9 +454,9 @@ define @fcvtzu_d_nxv2f64( %a) { define @scvtf_h_nxv2i1( %a) { ; CHECK-LABEL: scvtf_h_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.h, p0/m, z0.d ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -495,9 +495,9 @@ define @scvtf_h_nxv2i64( %a) { define @scvtf_h_nxv3i1( %a) { ; CHECK-LABEL: scvtf_h_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -516,9 +516,9 @@ define @scvtf_h_nxv3i16( %a) { define @scvtf_h_nxv4i1( %a) { ; CHECK-LABEL: scvtf_h_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -547,9 +547,9 @@ define @scvtf_h_nxv4i32( %a) { define @scvtf_h_nxv7i1( %a) { ; CHECK-LABEL: scvtf_h_nxv7i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -568,9 +568,9 @@ define @scvtf_h_nxv7i16( %a) { define 
@scvtf_h_nxv8i1( %a) { ; CHECK-LABEL: scvtf_h_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -589,9 +589,9 @@ define @scvtf_h_nxv8i16( %a) { define @scvtf_s_nxv2i1( %a) { ; CHECK-LABEL: scvtf_s_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.s, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -620,9 +620,9 @@ define @scvtf_s_nxv2i64( %a) { define @scvtf_s_nxv3i1( %a) { ; CHECK-LABEL: scvtf_s_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.s, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -641,9 +641,9 @@ define @scvtf_s_nxv3i32( %a) { define @scvtf_s_nxv4i1( %a) { ; CHECK-LABEL: scvtf_s_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.s, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -662,9 +662,9 @@ define @scvtf_s_nxv4i32( %a) { define @scvtf_d_nxv2i1( %a) { ; CHECK-LABEL: scvtf_d_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.d, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -695,9 +695,9 @@ define @scvtf_d_nxv2i64( %a) { define @ucvtf_h_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -736,9 +736,9 @@ define @ucvtf_h_nxv2i64( %a) { define @ucvtf_h_nxv3i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -767,9 +767,9 @@ define @ucvtf_h_nxv3i32( %a) { define @ucvtf_h_nxv4i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -798,9 +798,9 @@ define @ucvtf_h_nxv4i32( %a) { define @ucvtf_h_nxv8i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -819,9 +819,9 @@ define @ucvtf_h_nxv8i16( %a) { define @ucvtf_s_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_s_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.s, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -850,9 +850,9 @@ define @ucvtf_s_nxv2i64( %a) { define 
@ucvtf_s_nxv4i1( %a) { ; CHECK-LABEL: ucvtf_s_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.s, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -871,9 +871,9 @@ define @ucvtf_s_nxv4i32( %a) { define @ucvtf_d_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_d_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ret %res = uitofp %a to ret %res diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll index ed7ea657874a4a..28e1412c524a07 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_gather_base_plus_stride_v8f32(ptr %dst, ptr %src) #0 { ; CHECK-LABEL: masked_gather_base_plus_stride_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: index z0.s, #0, #7 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1, z0.s, sxtw #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -21,8 +21,8 @@ define void @masked_gather_base_plus_stride_v8f32(ptr %dst, ptr %src) #0 { define void @masked_gather_base_plus_stride_v4f64(ptr %dst, ptr %src) #0 { ; CHECK-LABEL: masked_gather_base_plus_stride_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov x8, #-32 // =0xffffffffffffffe0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: index z0.d, #-2, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1, z0.d, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll index ad482118ec0bbe..47fda39d840019 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @build_vector_7_inc1_v32i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: build_vector_7_inc1_v32i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: index z0.b, #7, #1 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret store <32 x i8> , ptr %a, align 1 @@ -18,8 +18,8 @@ define void @build_vector_7_inc1_v32i8(ptr %a) #0 { define void @build_vector_0_inc2_v16i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: build_vector_0_inc2_v16i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: index z0.h, #0, #2 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret store <16 x i16> , ptr %a, align 2 @@ -30,8 +30,8 @@ define void @build_vector_0_inc2_v16i16(ptr %a) #0 { define void @build_vector_0_dec3_v8i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: build_vector_0_dec3_v8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: index z0.s, #0, #-3 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret store <8 x i32> , ptr %a, align 4 @@ -42,8 +42,8 @@ define void @build_vector_0_dec3_v8i32(ptr %a) #0 { define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 { ; 
VBITS_GE_256-LABEL: build_vector_minus2_dec32_v4i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #-32 // =0xffffffffffffffe0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: index z0.d, #-2, x8 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -53,11 +53,6 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 { ; Constant but not a sequence. define void @build_vector_no_stride_v4i64(ptr %a) #0 { -; VBITS_GE_256-LABEL: .LCPI4_0: -; VBITS_GE_256: .xword 0 -; VBITS_GE_256-NEXT: .xword 4 -; VBITS_GE_256-NEXT: .xword 1 -; VBITS_GE_256-NEXT: .xword 8 ; VBITS_GE_256-LABEL: build_vector_no_stride_v4i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll index 65cb448cac117c..f7751131005e30 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll @@ -38,9 +38,9 @@ define void @concat_v32i8(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.b, vl32 ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2] +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: st1b { z0.b }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x i8>, ptr %a %op2 = load <16 x i8>, ptr %b @@ -66,11 +66,11 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v64i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl32 -; VBITS_GE_512-NEXT: ptrue p1.b, vl64 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.b, p0, z0.b, z1.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -90,11 +90,11 @@ define void @concat_v128i8(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v128i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2] +; CHECK-NEXT: ptrue p0.b, vl128 +; CHECK-NEXT: st1b { z0.b }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <64 x i8>, ptr %a %op2 = load <64 x i8>, ptr %b @@ -122,11 +122,11 @@ define void @concat_v256i8(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v256i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ptrue p1.b, vl256 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2] +; CHECK-NEXT: ptrue p0.b, vl256 +; CHECK-NEXT: st1b { z0.b }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -198,9 +198,9 @@ define void @concat_v16i16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b @@ -224,11 +224,11 @@ 
define void @concat_v32i16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.h, p0, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -244,11 +244,11 @@ define void @concat_v64i16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v64i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x i16>, ptr %a %op2 = load <32 x i16>, ptr %b @@ -268,11 +268,11 @@ define void @concat_v128i16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v128i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a %op2 = load <64 x i16>, ptr %b @@ -328,9 +328,9 @@ define void @concat_v8i32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.s, vl8 ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <4 x i32>, ptr %a %op2 = load <4 x i32>, ptr %b @@ -353,11 +353,11 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ptrue p1.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.s, p0, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -371,11 +371,11 @@ define void @concat_v32i32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v32i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x i32>, ptr %a %op2 = load <16 x i32>, ptr %b @@ -391,11 +391,11 @@ define void @concat_v64i32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v64i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: 
st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x i32>, ptr %a %op2 = load <32 x i32>, ptr %b @@ -433,9 +433,9 @@ define void @concat_v4i64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.d, vl4 ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b @@ -458,11 +458,11 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl4 -; VBITS_GE_512-NEXT: ptrue p1.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.d, p0, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -475,11 +475,11 @@ define void @concat_v16i64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ptrue p1.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x i64>, ptr %a %op2 = load <8 x i64>, ptr %b @@ -493,11 +493,11 @@ define void @concat_v32i64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v32i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x i64>, ptr %a %op2 = load <16 x i64>, ptr %b @@ -541,9 +541,9 @@ define void @concat_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x half>, ptr %a %op2 = load <8 x half>, ptr %b @@ -567,11 +567,11 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v32f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.h, p0, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -587,11 +587,11 @@ define void @concat_v64f16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v64f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; 
CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x half>, ptr %a %op2 = load <32 x half>, ptr %b @@ -611,11 +611,11 @@ define void @concat_v128f16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v128f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <64 x half>, ptr %a %op2 = load <64 x half>, ptr %b @@ -671,9 +671,9 @@ define void @concat_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.s, vl8 ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <4 x float>, ptr %a %op2 = load <4 x float>, ptr %b @@ -696,11 +696,11 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ptrue p1.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.s, p0, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -714,11 +714,11 @@ define void @concat_v32f32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v32f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x float>, ptr %a %op2 = load <16 x float>, ptr %b @@ -734,11 +734,11 @@ define void @concat_v64f32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v64f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %op2 = load <32 x float>, ptr %b @@ -776,9 +776,9 @@ define void @concat_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.d, vl4 ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <2 x double>, ptr %a %op2 = load <2 x double>, ptr %b @@ -801,11 +801,11 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl4 -; VBITS_GE_512-NEXT: ptrue p1.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.d, p0, z0.d, z1.d -; 
VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -818,11 +818,11 @@ define void @concat_v16f64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ptrue p1.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x double>, ptr %a %op2 = load <8 x double>, ptr %b @@ -836,11 +836,11 @@ define void @concat_v32f64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v32f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %op2 = load <16 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll index 485124c1d59ed9..ad4efeaf39247a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll @@ -70,9 +70,9 @@ define half @extractelement_v64f16(ptr %a) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f -; CHECK-NEXT: whilels p1.h, xzr, x8 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: lastb h0, p1, z0.h +; CHECK-NEXT: whilels p0.h, xzr, x8 +; CHECK-NEXT: lastb h0, p0, z0.h ; CHECK-NEXT: ret %op1 = load <64 x half>, ptr %a %r = extractelement <64 x half> %op1, i64 63 @@ -84,9 +84,9 @@ define half @extractelement_v128f16(ptr %a) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov w8, #127 // =0x7f -; CHECK-NEXT: whilels p1.h, xzr, x8 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: lastb h0, p1, z0.h +; CHECK-NEXT: whilels p0.h, xzr, x8 +; CHECK-NEXT: lastb h0, p0, z0.h ; CHECK-NEXT: ret %op1 = load <128 x half>, ptr %a %r = extractelement <128 x half> %op1, i64 127 @@ -154,9 +154,9 @@ define float @extractelement_v32f32(ptr %a) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f -; CHECK-NEXT: whilels p1.s, xzr, x8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: lastb s0, p1, z0.s +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb s0, p0, z0.s ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %r = extractelement <32 x float> %op1, i64 31 @@ -168,9 +168,9 @@ define float @extractelement_v64f32(ptr %a) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f -; CHECK-NEXT: whilels p1.s, xzr, x8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: lastb s0, p1, z0.s +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb s0, p0, z0.s ; CHECK-NEXT: ret %op1 = load <64 x float>, ptr %a %r = extractelement <64 x float> %op1, i64 63 @@ -236,9 +236,9 @@ define double @extractelement_v16f64(ptr %a) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: 
mov w8, #15 // =0xf -; CHECK-NEXT: whilels p1.d, xzr, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: lastb d0, p1, z0.d +; CHECK-NEXT: whilels p0.d, xzr, x8 +; CHECK-NEXT: lastb d0, p0, z0.d ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %r = extractelement <16 x double> %op1, i64 15 @@ -250,9 +250,9 @@ define double @extractelement_v32f64(ptr %a) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f -; CHECK-NEXT: whilels p1.d, xzr, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: lastb d0, p1, z0.d +; CHECK-NEXT: whilels p0.d, xzr, x8 +; CHECK-NEXT: lastb d0, p0, z0.d ; CHECK-NEXT: ret %op1 = load <32 x double>, ptr %a %r = extractelement <32 x double> %op1, i64 31 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll index bca3dfe5717efc..e77cd9ef55eaf2 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll @@ -396,9 +396,9 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mvni v1.4s, #128, lsl #24 ; CHECK-NEXT: ldr q2, [x0] -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] -; CHECK-NEXT: fcvt z0.s, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.s, p0/m, z0.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: bit v0.16b, v2.16b, v1.16b ; CHECK-NEXT: str q0, [x0] @@ -450,12 +450,12 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32: ; CHECK_EXTEND_ROUND: // %bb.0: -; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d, vl4 ; CHECK_EXTEND_ROUND-NEXT: ldr q0, [x1] +; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d, vl4 ; CHECK_EXTEND_ROUND-NEXT: uunpklo z0.d, z0.s -; CHECK_EXTEND_ROUND-NEXT: fcvt z0.d, p0/m, z0.s ; CHECK_EXTEND_ROUND-NEXT: ld1d { z1.d }, p0/z, [x0] ; CHECK_EXTEND_ROUND-NEXT: and z1.d, z1.d, #0x7fffffffffffffff +; CHECK_EXTEND_ROUND-NEXT: fcvt z0.d, p0/m, z0.s ; CHECK_EXTEND_ROUND-NEXT: and z0.d, z0.d, #0x8000000000000000 ; CHECK_EXTEND_ROUND-NEXT: orr z0.d, z1.d, z0.d ; CHECK_EXTEND_ROUND-NEXT: st1d { z0.d }, p0, [x0] @@ -494,9 +494,9 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mvni v1.4h, #128, lsl #8 ; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] -; CHECK-NEXT: fcvt z0.h, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: bit v0.8b, v2.8b, v1.8b @@ -521,9 +521,9 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mvni v1.8h, #128, lsl #8 ; CHECK-NEXT: ldr q2, [x0] -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1] -; CHECK-NEXT: fcvt z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: bit v0.16b, v2.16b, v1.16b ; CHECK-NEXT: str q0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll index 6da07b855a5c57..b60988be1e76c7 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll @@ -462,11 +462,11 @@ define void 
@fcvt_v8f64_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.d ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: fcvt z0.h, p1/m, z0.d -; VBITS_GE_256-NEXT: fcvt z1.h, p1/m, z1.d +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.d +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.d ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll index 13ebda1df7f9d1..d1e9dc13f50e87 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll @@ -34,15 +34,15 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) vsca define void @select_v16f16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x half>, ptr %a %op2 = load volatile <16 x half>, ptr %b @@ -54,33 +54,33 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, w2 +; VBITS_GE_256-NEXT: ptrue p0.h ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h -; VBITS_GE_256-NEXT: mov z0.h, p1/m, z2.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p1/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.h, p0, z0.h, z2.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z3.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v32f16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, w2 -; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: ptrue p0.h +; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x1 -; 
VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_512-NEXT: sel z0.h, p1, z1.h, z2.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.h, p0, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <32 x half>, ptr %a %op2 = load volatile <32 x half>, ptr %b @@ -92,15 +92,15 @@ define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v64f16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x half>, ptr %a %op2 = load volatile <64 x half>, ptr %b @@ -112,15 +112,15 @@ define void @select_v64f16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v128f16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x half>, ptr %a %op2 = load volatile <128 x half>, ptr %b @@ -158,15 +158,15 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) v define void @select_v8f32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl8 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x float>, ptr %a %op2 = load volatile <8 x float>, ptr %b @@ -178,33 +178,33 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; 
VBITS_GE_256-NEXT: and w8, w2, #0x1 -; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ptrue p0.s ; VBITS_GE_256-NEXT: mov z0.s, w8 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s -; VBITS_GE_256-NEXT: mov z0.s, p1/m, z2.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.s, p0, z0.s, z2.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: and w8, w2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: ptrue p0.s ; VBITS_GE_512-NEXT: mov z0.s, w8 -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_512-NEXT: sel z0.s, p1, z1.s, z2.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.s, p0, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <16 x float>, ptr %a %op2 = load volatile <16 x float>, ptr %b @@ -216,15 +216,15 @@ define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v32f32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x float>, ptr %a %op2 = load volatile <32 x float>, ptr %b @@ -236,15 +236,15 @@ define void @select_v32f32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v64f32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl64 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, 
#0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x float>, ptr %a %op2 = load volatile <64 x float>, ptr %b @@ -282,16 +282,16 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl4 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x double>, ptr %a %op2 = load volatile <4 x double>, ptr %b @@ -303,35 +303,35 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_256-NEXT: and x8, x2, #0x1 -; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ptrue p0.d ; VBITS_GE_256-NEXT: mov z0.d, x8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d -; VBITS_GE_256-NEXT: mov z0.d, p1/m, z2.d -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.d, p0, z0.d, z2.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_512-NEXT: and x8, x2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: ptrue p0.d ; VBITS_GE_512-NEXT: mov z0.d, x8 -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_512-NEXT: sel z0.d, p1, z1.d, z2.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.d, vl8 +; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.d, p0, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile 
<8 x double>, ptr %a %op2 = load volatile <8 x double>, ptr %b @@ -343,16 +343,16 @@ define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x double>, ptr %a %op2 = load volatile <16 x double>, ptr %b @@ -364,16 +364,16 @@ define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v32f64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x double>, ptr %a %op2 = load volatile <32 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll index da0cf927d74d24..af54b146c5b663 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll @@ -131,8 +131,8 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 { define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzu_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -357,7 +357,6 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzu z0.s, p0/m, z0.s @@ -366,7 +365,8 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzu_v16f32_v16i16: @@ -532,8 +532,8 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 { 
define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzu_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -752,7 +752,6 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzu z0.d, p0/m, z0.d @@ -761,7 +760,8 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzu_v8f64_v8i32: @@ -1024,8 +1024,8 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 { define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzs_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -1250,7 +1250,6 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzs z0.s, p0/m, z0.s @@ -1259,7 +1258,8 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzs_v16f32_v16i16: @@ -1425,8 +1425,8 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 { define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzs_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -1645,7 +1645,6 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzs z0.d, p0/m, z0.d @@ -1654,7 +1653,8 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzs_v8f64_v8i32: diff --git 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll index 115f722986b5c6..61e04682fa0bfb 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll @@ -85,10 +85,10 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: bl __trunctfdf2 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #128 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d @@ -111,14 +111,14 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload ; CHECK-NEXT: bl __trunctfdf2 ; CHECK-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: ptrue p1.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #128 -; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload ; CHECK-NEXT: mov x8, #4 // =0x4 -; CHECK-NEXT: splice z0.d, p1, z0.d, z1.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: st1d { z0.d }, p0, [x19, x8, lsl #3] ; CHECK-NEXT: add x8, sp, #128 ; CHECK-NEXT: ldr z0, [x8, #1, mul vl] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll index a4b5ccd69fdb75..1bd688d23050b1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll @@ -21,24 +21,24 @@ define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v ; CHECK-NEXT: .cfi_offset w22, -32 ; CHECK-NEXT: .cfi_offset w29, -48 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x11, sp, #176 -; CHECK-NEXT: add x10, sp, #144 -; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: add x10, sp, #176 ; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: add x9, sp, #144 ; CHECK-NEXT: add x20, sp, #176 -; CHECK-NEXT: ldp x13, x12, [sp, #328] ; CHECK-NEXT: ldr x15, [sp, #104] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x10] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: add x8, sp, #112 +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x9] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8] ; CHECK-NEXT: ldur q4, [sp, #88] -; CHECK-NEXT: ldp x16, x17, [sp, #208] +; CHECK-NEXT: ldp x9, x8, [sp, #328] ; CHECK-NEXT: ldr x19, [sp, #272] +; CHECK-NEXT: ldp x11, x10, [sp, #312] +; CHECK-NEXT: ldp x13, x12, [sp, #296] +; CHECK-NEXT: ldp x18, x14, [sp, #280] +; CHECK-NEXT: ldp x16, x17, [sp, #208] ; CHECK-NEXT: ldp x21, x22, [sp, #352] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x11] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x10] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] -; CHECK-NEXT: ldp x8, x14, [sp, #312] -; CHECK-NEXT: ldp x10, x9, [sp, #296] -; CHECK-NEXT: ldp x18, x11, [sp, #280] ; CHECK-NEXT: st1d { z3.d }, p0, [x20] ; CHECK-NEXT: add x20, sp, #144 ; CHECK-NEXT: st1d { z2.d }, p0, [x20] @@ -53,10 +53,10 @@ define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v ; CHECK-NEXT: stp x16, x17, [sp, #208] ; CHECK-NEXT: 
stur q4, [sp, #88] ; CHECK-NEXT: str x15, [sp, #104] -; CHECK-NEXT: stp x11, x10, [sp, #288] -; CHECK-NEXT: stp x9, x8, [sp, #304] -; CHECK-NEXT: stp x14, x13, [sp, #320] -; CHECK-NEXT: str x12, [sp, #336] +; CHECK-NEXT: stp x14, x13, [sp, #288] +; CHECK-NEXT: stp x12, x11, [sp, #304] +; CHECK-NEXT: stp x10, x9, [sp, #320] +; CHECK-NEXT: str x8, [sp, #336] ; CHECK-NEXT: ldr x29, [sp], #48 // 8-byte Folded Reload ; CHECK-NEXT: b func2 ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr %v13, ptr %v14, ptr %v15, ptr %v16, diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll index 977c528e2583af..6f4d257039bca1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll @@ -36,16 +36,16 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 { define void @insertelement_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z0.h, #0, #1 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; CHECK-NEXT: fmov h0, #5.00000000 -; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x1] +; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: fmov h1, #5.00000000 +; CHECK-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEXT: st1h { z0.h }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %r = insertelement <16 x half> %op1, half 5.0, i64 15 @@ -56,33 +56,33 @@ define void @insertelement_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @insertelement_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov w8, #15 // =0xf ; VBITS_GE_256-NEXT: index z0.h, #0, #1 -; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: ptrue p0.h ; VBITS_GE_256-NEXT: mov z1.h, w8 +; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: fmov h2, #5.00000000 -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov z3.h, p1/m, h2 -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] -; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; VBITS_GE_256-NEXT: fmov h0, #5.00000000 +; VBITS_GE_256-NEXT: ld1h { z1.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: mov z1.h, p0/m, h0 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p1, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v32f16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov w8, #31 // =0x1f ; VBITS_GE_512-NEXT: index z0.h, #0, #1 -; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: ptrue p0.h ; VBITS_GE_512-NEXT: mov z1.h, w8 -; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; VBITS_GE_512-NEXT: fmov h0, #5.00000000 -; VBITS_GE_512-NEXT: mov z2.h, p1/m, h0 -; VBITS_GE_512-NEXT: st1h { z2.h }, p0, [x1] +; 
VBITS_GE_512-NEXT: ptrue p1.h, vl32 +; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_512-NEXT: fmov h1, #5.00000000 +; VBITS_GE_512-NEXT: mov z0.h, p0/m, h1 +; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <32 x half>, ptr %a %r = insertelement <32 x half> %op1, half 5.0, i64 31 @@ -93,16 +93,16 @@ define void @insertelement_v32f16(ptr %a, ptr %b) #0 { define void @insertelement_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: index z0.h, #0, #1 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; CHECK-NEXT: fmov h0, #5.00000000 -; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x1] +; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: fmov h1, #5.00000000 +; CHECK-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEXT: st1h { z0.h }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <64 x half>, ptr %a %r = insertelement <64 x half> %op1, half 5.0, i64 63 @@ -113,16 +113,16 @@ define void @insertelement_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @insertelement_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov w8, #127 // =0x7f ; CHECK-NEXT: index z0.h, #0, #1 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; CHECK-NEXT: fmov h0, #5.00000000 -; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x1] +; CHECK-NEXT: ptrue p1.h, vl128 +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: fmov h1, #5.00000000 +; CHECK-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEXT: st1h { z0.h }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <128 x half>, ptr %a %r = insertelement <128 x half> %op1, half 5.0, i64 127 @@ -157,16 +157,16 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 { define void @insertelement_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z0.s, #0, #1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; CHECK-NEXT: fmov s0, #5.00000000 -; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x1] +; CHECK-NEXT: ptrue p1.s, vl8 +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: fmov s1, #5.00000000 +; CHECK-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEXT: st1w { z0.s }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %r = insertelement <8 x float> %op1, float 5.0, i64 7 @@ -177,33 +177,33 @@ define void @insertelement_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @insertelement_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov w8, #7 // =0x7 ; VBITS_GE_256-NEXT: index z0.s, #0, #1 -; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ptrue p0.s ; 
VBITS_GE_256-NEXT: mov z1.s, w8 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: fmov s2, #5.00000000 -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov z3.s, p1/m, s2 -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] -; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_256-NEXT: fmov s0, #5.00000000 +; VBITS_GE_256-NEXT: ld1w { z1.s }, p1/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov z1.s, p0/m, s0 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov w8, #15 // =0xf ; VBITS_GE_512-NEXT: index z0.s, #0, #1 -; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: ptrue p0.s ; VBITS_GE_512-NEXT: mov z1.s, w8 -; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; VBITS_GE_512-NEXT: fmov s0, #5.00000000 -; VBITS_GE_512-NEXT: mov z2.s, p1/m, s0 -; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_512-NEXT: fmov s1, #5.00000000 +; VBITS_GE_512-NEXT: mov z0.s, p0/m, s1 +; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x float>, ptr %a %r = insertelement <16 x float> %op1, float 5.0, i64 15 @@ -214,16 +214,16 @@ define void @insertelement_v16f32(ptr %a, ptr %b) #0 { define void @insertelement_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: index z0.s, #0, #1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; CHECK-NEXT: fmov s0, #5.00000000 -; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x1] +; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: fmov s1, #5.00000000 +; CHECK-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEXT: st1w { z0.s }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %r = insertelement <32 x float> %op1, float 5.0, i64 31 @@ -234,16 +234,16 @@ define void @insertelement_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @insertelement_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: index z0.s, #0, #1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; CHECK-NEXT: fmov s0, #5.00000000 -; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x1] +; CHECK-NEXT: ptrue p1.s, vl64 +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: fmov s1, #5.00000000 +; CHECK-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEXT: st1w { z0.s }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <64 x float>, ptr %a %r = insertelement <64 x float> 
%op1, float 5.0, i64 63 @@ -276,16 +276,16 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 define void @insertelement_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; CHECK-NEXT: fmov d0, #5.00000000 -; CHECK-NEXT: mov z2.d, p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x1] +; CHECK-NEXT: ptrue p1.d, vl4 +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: fmov d1, #5.00000000 +; CHECK-NEXT: mov z0.d, p0/m, d1 +; CHECK-NEXT: st1d { z0.d }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %r = insertelement <4 x double> %op1, double 5.0, i64 3 @@ -296,33 +296,33 @@ define void @insertelement_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @insertelement_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov w8, #3 // =0x3 ; VBITS_GE_256-NEXT: index z0.d, #0, #1 -; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ptrue p0.d ; VBITS_GE_256-NEXT: mov z1.d, x8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: fmov d2, #5.00000000 -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov z3.d, p1/m, d2 -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] -; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_256-NEXT: fmov d0, #5.00000000 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: mov z1.d, p0/m, d0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: mov w8, #7 // =0x7 ; VBITS_GE_512-NEXT: index z0.d, #0, #1 -; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: ptrue p0.d ; VBITS_GE_512-NEXT: mov z1.d, x8 -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; VBITS_GE_512-NEXT: fmov d0, #5.00000000 -; VBITS_GE_512-NEXT: mov z2.d, p1/m, d0 -; VBITS_GE_512-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p1.d, vl8 +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: fmov d1, #5.00000000 +; VBITS_GE_512-NEXT: mov z0.d, p0/m, d1 +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x double>, ptr %a %r = insertelement <8 x double> %op1, double 5.0, i64 7 @@ -333,16 +333,16 @@ define void @insertelement_v8f64(ptr %a, ptr %b) #0 { define void @insertelement_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d 
-; CHECK-NEXT: fmov d0, #5.00000000 -; CHECK-NEXT: mov z2.d, p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x1] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: fmov d1, #5.00000000 +; CHECK-NEXT: mov z0.d, p0/m, d1 +; CHECK-NEXT: st1d { z0.d }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %r = insertelement <16 x double> %op1, double 5.0, i64 15 @@ -353,16 +353,16 @@ define void @insertelement_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @insertelement_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; CHECK-NEXT: fmov d0, #5.00000000 -; CHECK-NEXT: mov z2.d, p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x1] +; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: fmov d1, #5.00000000 +; CHECK-NEXT: mov z0.d, p0/m, d1 +; CHECK-NEXT: st1d { z0.d }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <32 x double>, ptr %a %r = insertelement <32 x double> %op1, double 5.0, i64 31 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll index b45a62f4e7581a..58fca3a2cf8b6e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll @@ -1388,11 +1388,11 @@ define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 { ; CHECK-NEXT: mov x10, #64 // =0x40 ; CHECK-NEXT: mov x11, #80 // =0x50 ; CHECK-NEXT: mov x12, #32 // =0x20 -; CHECK-NEXT: mov x13, #48 // =0x30 -; CHECK-NEXT: mov x14, #16 // =0x10 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: mov x13, #48 // =0x30 +; CHECK-NEXT: mov x14, #16 // =0x10 ; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll index 11ed69513917c3..0ddf434eff930a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -15,9 +15,9 @@ target triple = "aarch64-unknown-linux-gnu" define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 @@ -94,11 +94,11 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0 ; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, 
vl4 ; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 @@ -203,7 +203,6 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: sdiv_v128i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: ptrue p0.s, vl64 @@ -221,7 +220,8 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h -; CHECK-NEXT: st1b { z1.h }, p1, [x0] +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: st1b { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -260,14 +260,14 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p1, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h -; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; CHECK-NEXT: splice z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] @@ -284,18 +284,18 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sdivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_256-NEXT: mov w8, v1.s[1] ; VBITS_GE_256-NEXT: mov v0.16b, v1.16b @@ -309,9 +309,9 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: sdiv_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sdivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_512-NEXT: mov w8, v1.s[1] ; VBITS_GE_512-NEXT: mov v0.16b, v1.16b @@ -329,11 +329,11 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h @@ -341,9 +341,9 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: sdiv_v8i16: ; VBITS_GE_256: // 
%bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s @@ -353,9 +353,9 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: sdiv_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s @@ -369,8 +369,8 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @sdiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 @@ -542,8 +542,8 @@ define void @sdiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @sdiv_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z1.s @@ -664,8 +664,8 @@ define void @sdiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @sdiv_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: sdivr z0.d, p0/m, z0.d, z1.d @@ -748,9 +748,9 @@ define void @sdiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 @@ -827,11 +827,11 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0 ; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 @@ -980,14 +980,14 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; 
CHECK-NEXT: splice z1.h, p1, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h -; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; CHECK-NEXT: splice z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] @@ -1004,18 +1004,18 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: udivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_256-NEXT: mov w8, v1.s[1] ; VBITS_GE_256-NEXT: mov v0.16b, v1.16b @@ -1029,9 +1029,9 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: udiv_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: udivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_512-NEXT: mov w8, v1.s[1] ; VBITS_GE_512-NEXT: mov v0.16b, v1.16b @@ -1049,11 +1049,11 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h @@ -1061,9 +1061,9 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: udiv_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -1073,9 +1073,9 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: udiv_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -1089,8 +1089,8 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @udiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: ushll2 
v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 @@ -1253,8 +1253,8 @@ define void @udiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @udiv_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z1.s @@ -1375,8 +1375,8 @@ define void @udiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @udiv_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: udivr z0.d, p0/m, z0.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll index 756e5f4cddf809..4feb86305f8f62 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -58,8 +58,8 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) vscale_range(2,0) #0 { define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: sext_v16i8_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -308,8 +308,8 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: sext_v8i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -472,8 +472,8 @@ define void @sext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: sext_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -554,8 +554,8 @@ define void @sext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: zext_v16i8_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -804,8 +804,8 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: zext_v8i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -968,8 +968,8 @@ define void @zext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: 
zext_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll index 38444d83c1d7e4..2d78945399176e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -15,9 +15,9 @@ target triple = "aarch64-unknown-linux-gnu" define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v2.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: sshll v3.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 @@ -97,9 +97,9 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 @@ -277,8 +277,8 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: ptrue p1.h, vl64 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h @@ -300,9 +300,9 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v2.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sshll v3.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s ; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h @@ -310,9 +310,9 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: srem_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sshll v2.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: sshll v3.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: mov w8, v2.s[1] ; VBITS_GE_256-NEXT: mov v3.16b, v2.16b @@ -326,9 +326,9 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: srem_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sshll v2.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: sshll v3.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: mov w8, v2.s[1] ; VBITS_GE_512-NEXT: mov v3.16b, v2.16b @@ -346,9 +346,9 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v8i16: ; VBITS_GE_128: // %bb.0: -; 
VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v4.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: sshll v3.4s, v1.4h, #0 @@ -359,11 +359,11 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: srem_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h @@ -372,11 +372,11 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: srem_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_512-NEXT: sunpklo z3.s, z0.h +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h @@ -389,8 +389,8 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @srem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 @@ -582,25 +582,25 @@ define void @srem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @srem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z0.s +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: sdiv z19.s, p0/m, z19.s, z3.s ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: sdiv z7.s, p0/m, z7.s, z6.s ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: sdiv z18.s, p0/m, z18.s, z17.s -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: sdiv z19.s, p0/m, z19.s, z3.s -; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s -; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: mls v1.4s, v4.4s, v0.4s ; VBITS_GE_128-NEXT: mls v2.4s, v19.4s, v3.4s -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s +; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: stp q1, q2, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v16i32: @@ -730,26 +730,26 @@ define void @srem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @srem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; 
VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: sdiv z4.d, p0/m, z4.d, z0.d +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: sdiv z19.d, p0/m, z19.d, z3.d ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: sdiv z7.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: sdiv z18.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: sdiv z19.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d ; VBITS_GE_128-NEXT: movprfx z1, z2 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d +; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v8i64: @@ -833,9 +833,9 @@ define void @srem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v2.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ushll v3.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 @@ -915,9 +915,9 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 @@ -1095,8 +1095,8 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: ptrue p1.h, vl64 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h @@ -1118,9 +1118,9 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v2.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: ushll v3.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s ; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h @@ -1128,9 +1128,9 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: urem_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: ushll v2.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: ushll v3.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: mov w8, v2.s[1] ; VBITS_GE_256-NEXT: mov v3.16b, v2.16b @@ -1144,9 +1144,9 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: 
urem_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: ushll v2.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: ushll v3.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: mov w8, v2.s[1] ; VBITS_GE_512-NEXT: mov v3.16b, v2.16b @@ -1164,9 +1164,9 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v4.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: ushll v3.4s, v1.4h, #0 @@ -1177,11 +1177,11 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: urem_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h @@ -1190,11 +1190,11 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: urem_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h @@ -1207,8 +1207,8 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @urem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 @@ -1400,25 +1400,25 @@ define void @urem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @urem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z0.s +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: udiv z19.s, p0/m, z19.s, z3.s ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: udiv z7.s, p0/m, z7.s, z6.s ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: udiv z18.s, p0/m, z18.s, z17.s -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: udiv z19.s, p0/m, z19.s, z3.s -; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s -; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: mls v1.4s, v4.4s, v0.4s ; VBITS_GE_128-NEXT: mls v2.4s, v19.4s, v3.4s -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, 
v17.4s +; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: stp q1, q2, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v16i32: @@ -1548,26 +1548,26 @@ define void @urem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @urem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: udiv z4.d, p0/m, z4.d, z0.d +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: udiv z19.d, p0/m, z19.d, z3.d ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: udiv z7.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: udiv z18.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: udiv z19.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d ; VBITS_GE_128-NEXT: movprfx z1, z2 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d +; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll index 710dce4de6dda3..37396ba7011be4 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll @@ -34,14 +34,14 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) vscale_ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ptrue p1.b, vl32 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i8>, ptr %a %op2 = load volatile <32 x i8>, ptr %b @@ -53,31 +53,31 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov z0.b, w2 +; VBITS_GE_256-NEXT: ptrue p0.b ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 -; VBITS_GE_256-NEXT: ptrue p1.b -; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1, x8] -; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.b, p1, z1.b, z3.b -; VBITS_GE_256-NEXT: mov z0.b, p1/m, z2.b -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x8] -; 
VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ptrue p1.b, vl32 +; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p1/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.b, p0, z0.b, z2.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z3.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p1, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v64i8: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: mov z0.b, w2 -; VBITS_GE_512-NEXT: ptrue p1.b -; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1b { z2.b }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; VBITS_GE_512-NEXT: sel z0.b, p1, z1.b, z2.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p0.b +; VBITS_GE_512-NEXT: ptrue p1.b, vl64 +; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.b, p0, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <64 x i8>, ptr %a %op2 = load volatile <64 x i8>, ptr %b @@ -89,14 +89,14 @@ define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v128i8(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v128i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ptrue p1.b, vl128 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x i8>, ptr %a %op2 = load volatile <128 x i8>, ptr %b @@ -108,14 +108,14 @@ define void @select_v128i8(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v256i8(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v256i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ptrue p1.b, vl256 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <256 x i8>, ptr %a %op2 = load volatile <256 x i8>, ptr %b @@ -153,15 +153,15 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) vscale_ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: 
ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x i16>, ptr %a %op2 = load volatile <16 x i16>, ptr %b @@ -173,33 +173,33 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, w2 +; VBITS_GE_256-NEXT: ptrue p0.h ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h -; VBITS_GE_256-NEXT: mov z0.h, p1/m, z2.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p1/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.h, p0, z0.h, z2.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z3.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v32i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, w2 -; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: ptrue p0.h +; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_512-NEXT: sel z0.h, p1, z1.h, z2.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.h, p0, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <32 x i16>, ptr %a %op2 = load volatile <32 x i16>, ptr %b @@ -211,15 +211,15 @@ define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v64i16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 
= load volatile <64 x i16>, ptr %a %op2 = load volatile <64 x i16>, ptr %b @@ -231,15 +231,15 @@ define void @select_v64i16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v128i16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v128i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x i16>, ptr %a %op2 = load volatile <128 x i16>, ptr %b @@ -277,15 +277,15 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl8 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x i32>, ptr %a %op2 = load volatile <8 x i32>, ptr %b @@ -297,33 +297,33 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: and w8, w2, #0x1 -; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ptrue p0.s ; VBITS_GE_256-NEXT: mov z0.s, w8 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s -; VBITS_GE_256-NEXT: mov z0.s, p1/m, z2.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.s, p0, z0.s, z2.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v16i32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: and w8, w2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: ptrue p0.s ; VBITS_GE_512-NEXT: mov z0.s, w8 -; 
VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_512-NEXT: sel z0.s, p1, z1.s, z2.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.s, p0, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <16 x i32>, ptr %a %op2 = load volatile <16 x i32>, ptr %b @@ -335,15 +335,15 @@ define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v32i32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i32>, ptr %a %op2 = load volatile <32 x i32>, ptr %b @@ -355,15 +355,15 @@ define void @select_v32i32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v64i32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v64i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl64 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x i32>, ptr %a %op2 = load volatile <64 x i32>, ptr %b @@ -401,16 +401,16 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl4 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x i64>, ptr %a %op2 = load volatile <4 x i64>, ptr %b @@ -422,35 +422,35 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v8i64: ; 
VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_256-NEXT: and x8, x2, #0x1 -; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ptrue p0.d ; VBITS_GE_256-NEXT: mov z0.d, x8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d -; VBITS_GE_256-NEXT: mov z0.d, p1/m, z2.d -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.d, p0, z0.d, z2.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v8i64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_512-NEXT: and x8, x2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: ptrue p0.d ; VBITS_GE_512-NEXT: mov z0.d, x8 -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_512-NEXT: sel z0.d, p1, z1.d, z2.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.d, vl8 +; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.d, p0, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <8 x i64>, ptr %a %op2 = load volatile <8 x i64>, ptr %b @@ -462,16 +462,16 @@ define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x i64>, ptr %a %op2 = load volatile <16 x i64>, ptr %b @@ -483,16 +483,16 @@ define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v32i64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; 
CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i64>, ptr %a %op2 = load volatile <32 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll index 50040eaa61e6c5..5bb012ae575034 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -131,8 +131,8 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 { define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: ucvtf_v8i16_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -354,7 +354,6 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.s @@ -363,7 +362,8 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v16i32_v16f16: @@ -535,8 +535,8 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 { define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: ucvtf_v4i32_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -759,7 +759,6 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.d @@ -768,7 +767,8 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f32: @@ -1038,8 +1038,8 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 { define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: scvtf_v8i16_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; 
CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -1273,7 +1273,6 @@ define void @scvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.s @@ -1282,7 +1281,8 @@ define void @scvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v16i32_v16f16: @@ -1454,8 +1454,8 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 { define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: scvtf_v4i32_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -1684,7 +1684,6 @@ define void @scvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.d @@ -1693,7 +1692,8 @@ define void @scvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f32: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll index e23151475014da..f2ad98f8caec9b 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll @@ -246,16 +246,16 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [z0.d] ; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [z1.d] -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_gather_v8i32: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll index 67a53d4e15f3bf..55d37d1bda5e4c 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll @@ -11,8 +11,8 @@ target triple = 
"aarch64-unknown-linux-gnu" define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) { ; CHECK-LABEL: masked_load_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll index bdd6ce0647016c..1a19b77f53c67e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll @@ -11,12 +11,12 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; CHECK-LABEL: masked_store_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: st1b { z1.b }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask) ret void @@ -27,11 +27,11 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: shl v0.8h, v0.8h, #15 ; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: st1h { z1.h }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void @@ -42,11 +42,11 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: st1w { z1.s }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void @@ -57,11 +57,11 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: shl v0.2d, v0.2d, #63 ; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z1.d }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask) ret void diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 92fce4584f6a97..27e95489f8ad7a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -41,11 +41,11 @@ define void @masked_gather_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: 
cmeq v0.4h, v0.4h, #0 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1b { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1b { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <4 x i8>, ptr %a @@ -65,7 +65,6 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: zip2 v1.8b, v0.8b, v0.8b ; VBITS_GE_256-NEXT: zip1 v0.8b, v0.8b, v0.8b -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8 ; VBITS_GE_256-NEXT: shl v0.4h, v0.4h, #8 ; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8 @@ -75,10 +74,11 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: ld1b { z1.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1b { z0.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1b { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s @@ -93,11 +93,11 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [z1.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b @@ -118,11 +118,11 @@ define void @masked_gather_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: cmeq v0.16b, v0.16b, #0 ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1b { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b @@ -194,10 +194,10 @@ define void @masked_gather_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: cmeq v0.4h, v0.4h, #0 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: str d0, [x0] @@ -219,15 +219,15 @@ define void @masked_gather_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: 
cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s @@ -242,10 +242,10 @@ define void @masked_gather_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_512-NEXT: str q0, [x0] @@ -332,9 +332,9 @@ define void @masked_gather_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret @@ -354,18 +354,18 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p2.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 -; VBITS_GE_256-NEXT: punpklo p3.h, p1.b ; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p1.h, p1.b +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p2.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: and p1.b, p3/z, p3.b, p2.b -; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: ld1w { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: cmpne p1.d, p2/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] @@ -460,8 +460,8 @@ define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; 
CHECK-LABEL: masked_gather_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cmeq v0.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x1] @@ -481,9 +481,9 @@ define void @masked_gather_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <4 x i64>, ptr %a @@ -500,13 +500,13 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z1.d] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [z1.d] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -515,9 +515,9 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret %cval = load <8 x i64>, ptr %a @@ -533,9 +533,9 @@ define void @masked_gather_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <16 x i64>, ptr %a @@ -551,9 +551,9 @@ define void @masked_gather_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <32 x i64>, ptr %a @@ -603,10 +603,10 @@ define void @masked_gather_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v0.4h, v0.4h, #0.0 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, 
#0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: str d0, [x0] @@ -626,17 +626,17 @@ define void @masked_gather_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: fcmeq v0.8h, v0.8h, #0.0 -; VBITS_GE_256-NEXT: sunpklo z2.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s @@ -651,10 +651,10 @@ define void @masked_gather_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: fcmeq v0.8h, v0.8h, #0.0 ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_512-NEXT: str q0, [x0] @@ -741,9 +741,9 @@ define void @masked_gather_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret @@ -763,18 +763,18 @@ define void @masked_gather_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p2.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_256-NEXT: punpklo p3.h, p1.b ; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p1.h, p1.b +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p2.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: and p1.b, p3/z, p3.b, p2.b -; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: ld1w { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: cmpne p1.d, p2/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: 
ld1w { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] @@ -869,8 +869,8 @@ define void @masked_gather_v1f64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @masked_gather_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x1] @@ -1202,8 +1202,8 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) # ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] -; CHECK-NEXT: punpklo p3.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p3/z, [z0.d] +; CHECK-NEXT: punpklo p2.h, p1.b +; CHECK-NEXT: ld1w { z0.d }, p2/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll index 467378e7da59bc..c22d9e71c51a93 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -35,9 +35,9 @@ define <2 x half> @masked_load_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_load_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -53,9 +53,9 @@ define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 { ; CHECK-LABEL: masked_load_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -401,8 +401,8 @@ define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -436,8 +436,8 @@ define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: ldr d0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -504,8 +504,8 @@ define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64: ; VBITS_GE_256: // %bb.0: -; 
VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 @@ -603,8 +603,8 @@ define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -638,8 +638,8 @@ define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: ldr d0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -706,8 +706,8 @@ define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 @@ -782,11 +782,11 @@ define void @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.b, vl16 -; VBITS_GE_256-NEXT: ptrue p2.b, vl32 ; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b -; VBITS_GE_256-NEXT: cmpne p1.b, p2/z, z1.b, #0 +; VBITS_GE_256-NEXT: ptrue p1.b, vl32 +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] ; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 @@ -1000,11 +1000,11 @@ define void @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.s, vl4 -; VBITS_GE_256-NEXT: ptrue p2.s, vl8 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s -; VBITS_GE_256-NEXT: cmpne p1.s, p2/z, z1.s, #0 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0] ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 @@ -1041,11 +1041,11 @@ define void @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.b, vl16 -; VBITS_GE_256-NEXT: ptrue p2.b, vl32 ; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b -; VBITS_GE_256-NEXT: cmpne p1.b, 
p2/z, z1.b, #0 +; VBITS_GE_256-NEXT: ptrue p1.b, vl32 +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] ; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 @@ -1259,11 +1259,11 @@ define void @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.s, vl4 -; VBITS_GE_256-NEXT: ptrue p2.s, vl8 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s -; VBITS_GE_256-NEXT: cmpne p1.s, p2/z, z1.s, #0 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0] ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index e2d341c22efc26..e3e06dcdf17f30 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -39,12 +39,12 @@ define void @masked_scatter_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: cmeq v1.4h, v0.4h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1b { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1b { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i8>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -61,11 +61,11 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v1.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: zip1 v3.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: zip1 v2.8b, v1.8b, v0.8b ; VBITS_GE_256-NEXT: zip2 v1.8b, v1.8b, v0.8b ; VBITS_GE_256-NEXT: zip2 v0.8b, v0.8b, v0.8b ; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8 ; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -78,10 +78,10 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1b { z3.d }, p1, [z4.d] -; VBITS_GE_256-NEXT: st1b { z0.d }, p0, [z2.d] +; VBITS_GE_256-NEXT: st1b { z3.d }, p1, [z2.d] +; VBITS_GE_256-NEXT: st1b { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_scatter_v8i8: @@ -92,12 +92,12 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_512-NEXT: st1b { 
z0.d }, p0, [z2.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: st1b { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i8>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -115,12 +115,12 @@ define void @masked_scatter_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1b { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1b { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x i8>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -135,13 +135,13 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: uunpklo z1.h, z0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z0.s, z1.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1b { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret @@ -187,10 +187,10 @@ define void @masked_scatter_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1h { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i16>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -208,20 +208,20 @@ define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: cmeq v1.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z4.d] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z2.d] +; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_scatter_v8i16: @@ -232,10 +232,10 @@ define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, 
z1.d, #0 -; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: st1h { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i16>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -253,9 +253,9 @@ define void @masked_scatter_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x i16>, ptr %a @@ -274,9 +274,9 @@ define void @masked_scatter_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x i16>, ptr %a @@ -317,9 +317,9 @@ define void @masked_scatter_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: cmeq v1.4s, v0.4s, #0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1w { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1w { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i32>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -333,21 +333,21 @@ define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: punpklo p2.h, p1.b ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p0.h, p0.b +; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b ; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z3.d] +; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; @@ -434,8 +434,8 @@ define void @masked_scatter_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cmeq v1.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ldr q1, [x1] @@ -454,8 +454,8 @@ define void 
@masked_scatter_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i64>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -484,8 +484,8 @@ define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [z1.d] +; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i64>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -500,8 +500,8 @@ define void @masked_scatter_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x i64>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -516,8 +516,8 @@ define void @masked_scatter_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x i64>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -539,15 +539,15 @@ define void @masked_scatter_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: mov v0.h[0], v2.h[0] ; CHECK-NEXT: mov w8, v2.s[1] ; CHECK-NEXT: mov v0.h[1], w8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ldr q0, [x1] -; CHECK-NEXT: st1h { z1.d }, p0, [z0.d] +; CHECK-NEXT: uunpklo z0.d, z1.s +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <2 x half>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -565,10 +565,10 @@ define void @masked_scatter_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1h { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x half>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -586,20 +586,20 @@ define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: fcmeq v1.8h, v0.8h, #0.0 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] 
; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z4.d] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z2.d] +; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_scatter_v8f16: @@ -610,10 +610,10 @@ define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: st1h { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x half>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -631,9 +631,9 @@ define void @masked_scatter_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x half>, ptr %a @@ -652,9 +652,9 @@ define void @masked_scatter_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a @@ -695,9 +695,9 @@ define void @masked_scatter_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1w { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1w { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x float>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -711,21 +711,21 @@ define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: punpklo p2.h, p1.b ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // 
=0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p0.h, p0.b +; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b ; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z3.d] +; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; @@ -812,8 +812,8 @@ define void @masked_scatter_v1f64(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @masked_scatter_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ldr q1, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll index 68fb4cc6afb093..b0d4f79aea1107 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -34,9 +34,9 @@ define void @masked_store_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_store_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: fcmeq v1.2s, v0.2s, v1.2s ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -51,9 +51,9 @@ define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 { ; CHECK-LABEL: masked_store_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcmeq v1.4s, v0.4s, v1.4s ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -161,11 +161,11 @@ define void @masked_store_trunc_v8i64i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s -; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z3.s, p0, z3.s, z2.s -; VBITS_GE_256-NEXT: cmpne p0.s, p1/z, z3.s, #0 -; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z3.s, #0 +; VBITS_GE_256-NEXT: st1b { z1.s }, p1, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i8: @@ -197,14 +197,14 @@ define void @masked_store_trunc_v8i64i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 -; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s -; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; 
VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z3.s, p0, z3.s, z2.s ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h @@ -246,11 +246,11 @@ define void @masked_store_trunc_v8i64i32(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s -; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z3.s, p0, z3.s, z2.s -; VBITS_GE_256-NEXT: cmpne p0.s, p1/z, z3.s, #0 -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z3.s, #0 +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i32: @@ -282,14 +282,14 @@ define void @masked_store_trunc_v16i32i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 -; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b -; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h -; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h ; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h ; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b ; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] @@ -327,17 +327,17 @@ define void @masked_store_trunc_v16i32i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 ; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h ; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b -; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] ; VBITS_GE_256-NEXT: sunpklo z2.h, z3.b -; VBITS_GE_256-NEXT: cmpne p0.h, p1/z, z2.h, #0 +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z2.h, #0 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] ; VBITS_GE_256-NEXT: ret ; @@ -375,11 +375,11 @@ define void @masked_store_trunc_v32i16i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b -; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: splice z3.b, p0, z3.b, z2.b -; VBITS_GE_256-NEXT: cmpne p0.b, p1/z, z3.b, #0 -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z3.b, #0 +; VBITS_GE_256-NEXT: st1b { z1.b }, p1, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_store_trunc_v32i16i8: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index a5303c901b80f3..fb169491b0c909 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -64,7 +64,6 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: mov x10, #8 // =0x8 -; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: mov v2.b[7], w11 ; CHECK-NEXT: mov v1.b[7], w9 ; CHECK-NEXT: uunpklo z3.h, z3.b @@ -86,22 +85,23 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: asr z2.s, z2.s, #31 ; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x9, lsl #2] ; CHECK-NEXT: and z2.s, z2.s, #0x1 ; CHECK-NEXT: and z1.s, z1.s, #0x1 -; CHECK-NEXT: mov z4.s, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: cmpne p3.s, p0/z, z2.s, #0 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 -; CHECK-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] -; CHECK-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] -; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 -; CHECK-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] -; CHECK-NEXT: st1w { z2.s }, p0, [x0] +; CHECK-NEXT: cmpne p4.s, p0/z, z1.s, #0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: mov z3.s, p2/m, #0 // =0x0 +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: mov z2.s, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z1.s, p4/m, #0 // =0x0 +; CHECK-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2] +; CHECK-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll index f97ca05f3bdd4b..b633057be139ce 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -34,8 +34,8 @@ define <16 x i8> @splat_v16i8(i8 %a) vscale_range(2,0) #0 { define void @splat_v32i8(i8 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <32 x i8> undef, i8 %a, i64 0 @@ -47,8 +47,8 @@ define void @splat_v32i8(i8 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v64i8(i8 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov z0.b, w0 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8] ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] @@ -56,8 +56,8 @@ define void @splat_v64i8(i8 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v64i8: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; 
VBITS_GE_512-NEXT: mov z0.b, w0 +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <64 x i8> undef, i8 %a, i64 0 @@ -69,8 +69,8 @@ define void @splat_v64i8(i8 %a, ptr %b) #0 { define void @splat_v128i8(i8 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v128i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <128 x i8> undef, i8 %a, i64 0 @@ -82,8 +82,8 @@ define void @splat_v128i8(i8 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v256i8(i8 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v256i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <256 x i8> undef, i8 %a, i64 0 @@ -117,8 +117,8 @@ define <8 x i16> @splat_v8i16(i16 %a) vscale_range(2,0) #0 { define void @splat_v16i16(i16 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <16 x i16> undef, i16 %a, i64 0 @@ -130,8 +130,8 @@ define void @splat_v16i16(i16 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v32i16(i16 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, w0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] @@ -139,8 +139,8 @@ define void @splat_v32i16(i16 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v32i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, w0 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <32 x i16> undef, i16 %a, i64 0 @@ -152,8 +152,8 @@ define void @splat_v32i16(i16 %a, ptr %b) #0 { define void @splat_v64i16(i16 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <64 x i16> undef, i16 %a, i64 0 @@ -165,8 +165,8 @@ define void @splat_v64i16(i16 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v128i16(i16 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v128i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <128 x i16> undef, i16 %a, i64 0 @@ -200,8 +200,8 @@ define <4 x i32> @splat_v4i32(i32 %a) vscale_range(2,0) #0 { define void @splat_v8i32(i32 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <8 x i32> undef, i32 %a, i64 0 @@ -213,8 +213,8 @@ define void @splat_v8i32(i32 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v16i32(i32 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v16i32: ; VBITS_GE_256: 
// %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, w0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] @@ -222,8 +222,8 @@ define void @splat_v16i32(i32 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v16i32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov z0.s, w0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <16 x i32> undef, i32 %a, i64 0 @@ -235,8 +235,8 @@ define void @splat_v16i32(i32 %a, ptr %b) #0 { define void @splat_v32i32(i32 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <32 x i32> undef, i32 %a, i64 0 @@ -248,8 +248,8 @@ define void @splat_v32i32(i32 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v64i32(i32 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v64i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <64 x i32> undef, i32 %a, i64 0 @@ -283,8 +283,8 @@ define <2 x i64> @splat_v2i64(i64 %a) vscale_range(2,0) #0 { define void @splat_v4i64(i64 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <4 x i64> undef, i64 %a, i64 0 @@ -296,8 +296,8 @@ define void @splat_v4i64(i64 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v8i64(i64 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov z0.d, x0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] @@ -305,8 +305,8 @@ define void @splat_v8i64(i64 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v8i64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: mov z0.d, x0 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <8 x i64> undef, i64 %a, i64 0 @@ -318,8 +318,8 @@ define void @splat_v8i64(i64 %a, ptr %b) #0 { define void @splat_v16i64(i64 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <16 x i64> undef, i64 %a, i64 0 @@ -331,8 +331,8 @@ define void @splat_v16i64(i64 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v32i64(i64 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <32 x i64> undef, i64 %a, i64 0 @@ -372,8 +372,8 @@ define <8 x half> @splat_v8f16(half %a) vscale_range(2,0) #0 { define void @splat_v16f16(half %a, ptr %b) vscale_range(2,0) #0 { ; 
CHECK-LABEL: splat_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -386,8 +386,8 @@ define void @splat_v16f16(half %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v32f16(half %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: mov z0.h, h0 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] @@ -396,8 +396,8 @@ define void @splat_v32f16(half %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v32f16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, h0 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -410,8 +410,8 @@ define void @splat_v32f16(half %a, ptr %b) #0 { define void @splat_v64f16(half %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -424,8 +424,8 @@ define void @splat_v64f16(half %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v128f16(half %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -462,8 +462,8 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) vscale_range(2,0) #0 define void @splat_v8f32(float %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -476,8 +476,8 @@ define void @splat_v8f32(float %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v16f32(float %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: mov z0.s, s0 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -486,8 +486,8 @@ define void @splat_v16f32(float %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov z0.s, s0 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -500,8 +500,8 @@ define void @splat_v16f32(float %a, ptr %b) #0 { define void @splat_v32f32(float %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -514,8 +514,8 @@ define void @splat_v32f32(float %a, ptr %b) 
vscale_range(8,0) #0 { define void @splat_v64f32(float %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -550,8 +550,8 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) vscale_range(2,0) define void @splat_v4f64(double %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -564,8 +564,8 @@ define void @splat_v4f64(double %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v8f64(double %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: mov z0.d, d0 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] @@ -574,8 +574,8 @@ define void @splat_v8f64(double %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: mov z0.d, d0 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -588,8 +588,8 @@ define void @splat_v8f64(double %a, ptr %b) #0 { define void @splat_v16f64(double %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -602,8 +602,8 @@ define void @splat_v16f64(double %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v32f64(double %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -620,8 +620,8 @@ define void @splat_v32f64(double %a, ptr %b) vscale_range(16,0) #0 { define void @splat_imm_v64i8(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: mov z0.b, #1 // =0x1 +; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <64 x i8> undef, i8 1, i64 0 @@ -633,8 +633,8 @@ define void @splat_imm_v64i8(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v32i16(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: mov z0.h, #2 // =0x2 +; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <32 x i16> undef, i16 2, i64 0 @@ -646,8 +646,8 @@ define void @splat_imm_v32i16(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v16i32(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: mov z0.s, #3 // =0x3 +; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <16 x i32> 
undef, i32 3, i64 0 @@ -659,8 +659,8 @@ define void @splat_imm_v16i32(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v8i64(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: mov z0.d, #4 // =0x4 +; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <8 x i64> undef, i64 4, i64 0 @@ -676,8 +676,8 @@ define void @splat_imm_v8i64(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v32f16(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v32f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: fmov z0.h, #5.00000000 +; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <32 x half> undef, half 5.0, i64 0 @@ -689,8 +689,8 @@ define void @splat_imm_v32f16(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v16f32(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: fmov z0.s, #6.00000000 +; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <16 x float> undef, float 6.0, i64 0 @@ -702,8 +702,8 @@ define void @splat_imm_v16f32(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v8f64(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: fmov z0.d, #7.00000000 +; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <8 x double> undef, double 7.0, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll index 2dc4bddb81a6db..020d5cb53bf21e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll @@ -36,14 +36,14 @@ define void @store_trunc_v8i64i8(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1b { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v8i64i8: @@ -117,14 +117,14 @@ define void @store_trunc_v8i64i32(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v8i64i32: @@ -172,14 +172,14 @@ define void @store_trunc_v16i32i16(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; 
VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v16i32i16: @@ -199,14 +199,14 @@ define void @store_trunc_v32i16i8(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ptrue p1.b, vl32 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b -; VBITS_GE_256-NEXT: st1b { z1.b }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v32i16i8: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll index 68c234a20d1108..28094c7b68e7c1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll @@ -559,13 +559,13 @@ define <8 x i16> @shuffle_index_indices_from_both_ops_i16(ptr %a, ptr %b) { ; ; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops_i16: ; SVE2_128_NOMAX: // %bb.0: -; SVE2_128_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_128_NOMAX-NEXT: cnth x8 ; SVE2_128_NOMAX-NEXT: adrp x9, .LCPI7_0 ; SVE2_128_NOMAX-NEXT: adrp x10, .LCPI7_1 ; SVE2_128_NOMAX-NEXT: mov z0.h, w8 ; SVE2_128_NOMAX-NEXT: ldr q1, [x9, :lo12:.LCPI7_0] ; SVE2_128_NOMAX-NEXT: ldr q2, [x10, :lo12:.LCPI7_1] +; SVE2_128_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_128_NOMAX-NEXT: mad z0.h, p0/m, z1.h, z2.h ; SVE2_128_NOMAX-NEXT: ldr q1, [x0] ; SVE2_128_NOMAX-NEXT: ldr q2, [x1] @@ -575,13 +575,13 @@ define <8 x i16> @shuffle_index_indices_from_both_ops_i16(ptr %a, ptr %b) { ; ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops_i16: ; SVE2_NOMIN_NOMAX: // %bb.0: -; SVE2_NOMIN_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_NOMIN_NOMAX-NEXT: cnth x8 ; SVE2_NOMIN_NOMAX-NEXT: adrp x9, .LCPI7_0 ; SVE2_NOMIN_NOMAX-NEXT: adrp x10, .LCPI7_1 ; SVE2_NOMIN_NOMAX-NEXT: mov z0.h, w8 ; SVE2_NOMIN_NOMAX-NEXT: ldr q1, [x9, :lo12:.LCPI7_0] ; SVE2_NOMIN_NOMAX-NEXT: ldr q2, [x10, :lo12:.LCPI7_1] +; SVE2_NOMIN_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_NOMIN_NOMAX-NEXT: mad z0.h, p0/m, z1.h, z2.h ; SVE2_NOMIN_NOMAX-NEXT: ldr q1, [x0] ; SVE2_NOMIN_NOMAX-NEXT: ldr q2, [x1] @@ -597,9 +597,9 @@ define <8 x i16> @shuffle_index_indices_from_both_ops_i16(ptr %a, ptr %b) { ; SVE2_MIN_256_NOMAX-NEXT: adrp x9, .LCPI7_1 ; SVE2_MIN_256_NOMAX-NEXT: add x9, x9, :lo12:.LCPI7_1 ; SVE2_MIN_256_NOMAX-NEXT: cnth x10 -; SVE2_MIN_256_NOMAX-NEXT: mov z2.h, w10 ; SVE2_MIN_256_NOMAX-NEXT: ld1h { z0.h }, p0/z, [x8] ; SVE2_MIN_256_NOMAX-NEXT: ld1h { z1.h }, p0/z, [x9] +; SVE2_MIN_256_NOMAX-NEXT: mov z2.h, w10 ; SVE2_MIN_256_NOMAX-NEXT: mad z0.h, p0/m, z2.h, z1.h ; SVE2_MIN_256_NOMAX-NEXT: ldr q1, [x0] ; SVE2_MIN_256_NOMAX-NEXT: ldr q2, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll index 
5ff9f0f0df62f8..afe13851f0b953 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll @@ -7,11 +7,11 @@ define i64 @scalable_int_min_max(ptr %arg, ptr %arg1, %i37, < ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #3745 // =0xea1 ; CHECK-NEXT: movk w8, #16618, lsl #16 +; CHECK-NEXT: ld1w { z3.d }, p0/z, [x0] ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: mov w8, #57344 // =0xe000 ; CHECK-NEXT: movk w8, #17535, lsl #16 ; CHECK-NEXT: mov z5.s, w8 -; CHECK-NEXT: ld1w { z3.d }, p0/z, [x0] ; CHECK-NEXT: fmul z4.s, p0/m, z4.s, z3.s ; CHECK-NEXT: fadd z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: mov z5.d, #1023 // =0x3ff diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll index aefc8de431436a..6420071b3dce44 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll @@ -92,8 +92,8 @@ define @fsqrt_recip_8f16( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte z1.h, z0.h ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: fmul z2.h, z1.h, z1.h ; CHECK-NEXT: fcmne p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fmul z2.h, z1.h, z1.h ; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h ; CHECK-NEXT: fmul z1.h, z1.h, z2.h ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h @@ -117,8 +117,8 @@ define @fsqrt_recip_4f32( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte z1.s, z0.s ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: fmul z2.s, z1.s, z1.s ; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fmul z2.s, z1.s, z1.s ; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s ; CHECK-NEXT: fmul z1.s, z1.s, z2.s ; CHECK-NEXT: fmul z2.s, z1.s, z1.s @@ -145,8 +145,8 @@ define @fsqrt_recip_2f64( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte z1.d, z0.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmul z2.d, z1.d, z1.d ; CHECK-NEXT: fcmne p0.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: fmul z2.d, z1.d, z1.d ; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d ; CHECK-NEXT: fmul z1.d, z1.d, z2.d ; CHECK-NEXT: fmul z2.d, z1.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll index 460d8a8694bc4a..1a2ab8d4253abc 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll @@ -51,10 +51,10 @@ define half @fadda_nxv6f16( %v, half %s) { ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #32768 // =0x8000 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: fmov s0, s1 ; CHECK-NEXT: st1h { z2.d }, p1, [sp, #3, mul vl] diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll index 813f1601e809e6..584c29ebcfc048 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll @@ -14,8 +14,8 @@ declare @llvm.fptosi.sat.nxv4f32.nxv4i64( define @test_signed_v2f32_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f32_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff @@ -38,8 +38,8 @@ define @test_signed_v2f32_v2i32( %f) { define @test_signed_v4f32_v4i32( %f) { ; 
CHECK-LABEL: test_signed_v4f32_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0x80000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff @@ -67,29 +67,29 @@ define @test_signed_v8f32_v8i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 -; CHECK-NEXT: mov z3.s, #0x80000000 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z6.s, #0x7fffffff ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff -; CHECK-NEXT: mov z6.s, #0x7fffffff -; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z2.s -; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z2.s -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzs z2.s, p0/m, z0.s +; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: fcvtzs z5.s, p0/m, z1.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z4.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z1.s, z4.s +; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z2.s +; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z2.s +; CHECK-NEXT: mov z2.s, #0x80000000 +; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z3.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z1.s, z3.s ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.s, p1/m, z3.s +; CHECK-NEXT: sel z3.s, p1, z2.s, z4.s ; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z0.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z1.s, z1.s -; CHECK-NEXT: sel z3.s, p2, z3.s, z5.s -; CHECK-NEXT: sel z0.s, p3, z6.s, z2.s -; CHECK-NEXT: sel z1.s, p4, z6.s, z3.s +; CHECK-NEXT: sel z2.s, p2, z2.s, z5.s +; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s +; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 @@ -103,8 +103,8 @@ define @test_signed_v8f32_v8i32( %f) { define @test_signed_v4f32_v4i16( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-956301312 // =0xc7000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #65024 // =0xfe00 ; CHECK-NEXT: movk w8, #18175, lsl #16 @@ -132,28 +132,28 @@ define @test_signed_v8f32_v8i16( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-956301312 // =0xc7000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z5.s, #32767 // =0x7fff ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #65024 // =0xfe00 ; CHECK-NEXT: movk w8, #18175, lsl #16 -; CHECK-NEXT: mov z3.s, w8 -; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s -; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.s, p0/m, z1.s +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: fcvtzs z3.s, p0/m, z1.s ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s +; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s +; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: 
fcmgt p3.s, p0/z, z1.s, z2.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z2.s ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.s, p1/m, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: mov z3.s, p1/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s ; CHECK-NEXT: mov z4.s, p2/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.s, p3, z5.s, z2.s +; CHECK-NEXT: sel z0.s, p3, z5.s, z3.s ; CHECK-NEXT: sel z1.s, p4, z5.s, z4.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 @@ -169,8 +169,8 @@ define @test_signed_v8f32_v8i16( %f) { define @test_signed_v2f32_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff @@ -198,31 +198,31 @@ define @test_signed_v4f32_v4i64( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z3.d, #0x8000000000000000 -; CHECK-NEXT: mov z4.s, w8 +; CHECK-NEXT: mov z3.s, w8 ; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.s +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.s ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z4.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z4.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z3.d, p2, z3.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -246,8 +246,8 @@ declare @llvm.fptosi.sat.nxv4f64.nxv4i64( @test_signed_v2f64_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f64_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 @@ -276,30 +276,30 @@ define @test_signed_v4f64_v4i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, 
#-4476578029606273024 // =0xc1e0000000000000 -; CHECK-NEXT: mov z3.d, #0xffffffff80000000 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z6.d, #0x7fffffff ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 -; CHECK-NEXT: mov z6.d, #0x7fffffff ; CHECK-NEXT: movk x8, #16863, lsl #48 -; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.d +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z4.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z4.d +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, #0xffffffff80000000 +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z3.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z3.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z3.d, p2, z3.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -322,48 +322,48 @@ define @test_signed_v8f64_v8i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 -; CHECK-NEXT: mov z26.d, #0x7fffffff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z5.d, #0xffffffff80000000 ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 +; CHECK-NEXT: mov z26.d, #0x7fffffff ; CHECK-NEXT: movk x8, #16863, lsl #48 -; CHECK-NEXT: mov z5.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z4.d -; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d -; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, z4.d -; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, z4.d -; CHECK-NEXT: mov z4.d, #0xffffffff80000000 -; CHECK-NEXT: movprfx z6, z1 -; CHECK-NEXT: fcvtzs z6.d, p0/m, z1.d ; CHECK-NEXT: movprfx z7, z0 ; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.d ; CHECK-NEXT: movprfx z24, z3 ; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.d +; CHECK-NEXT: mov z6.d, x8 ; CHECK-NEXT: movprfx z25, z2 ; CHECK-NEXT: fcvtzs z25.d, p0/m, z2.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z5.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z5.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z3.d, z5.d +; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z4.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d +; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, z4.d +; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, z4.d +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z6.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z6.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z3.d, z6.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b ; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: mov z4.d, p1/m, z5.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z2.d, z6.d ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z6.d, p1/m, 
z4.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z2.d, z5.d -; CHECK-NEXT: sel z5.d, p2, z4.d, z7.d +; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d ; CHECK-NEXT: fcmuo p2.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z7.d, p3, z4.d, z24.d +; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d ; CHECK-NEXT: fcmuo p3.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z4.d, p4, z4.d, z25.d +; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d ; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z2.d, z2.d -; CHECK-NEXT: sel z0.d, p5, z26.d, z6.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z5.d +; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d +; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z4.d +; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 @@ -387,28 +387,28 @@ define @test_signed_v4f64_v4i16( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4548635623644200960 // =0xc0e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z5.d, #32767 // =0x7fff ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000 ; CHECK-NEXT: movk x8, #16607, lsl #48 -; CHECK-NEXT: mov z3.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.d +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: fcvtzs z3.d, p0/m, z1.d ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z3.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z3.d +; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z2.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: mov z3.d, p1/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d ; CHECK-NEXT: mov z4.d, p2/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.d, p3, z5.d, z2.d +; CHECK-NEXT: sel z0.d, p3, z5.d, z3.d ; CHECK-NEXT: sel z1.d, p4, z5.d, z4.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 @@ -432,34 +432,34 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4548635623644200960 // =0xc0e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, #32767 // =0x7fff ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000 ; CHECK-NEXT: movk x8, #16607, lsl #48 -; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, z4.d -; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d -; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, z4.d -; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, z4.d -; CHECK-NEXT: movprfx z5, z3 -; 
CHECK-NEXT: fcvtzs z5.d, p0/m, z3.d -; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: movprfx z6, z2 ; CHECK-NEXT: fcvtzs z6.d, p0/m, z2.d ; CHECK-NEXT: movprfx z7, z1 ; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.d +; CHECK-NEXT: mov z5.d, x8 ; CHECK-NEXT: movprfx z24, z0 ; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d +; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, z4.d +; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d +; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, z4.d +; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, z4.d +; CHECK-NEXT: movprfx z4, z3 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z3.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z5.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z5.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z1.d, z5.d ; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z4.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z4.d ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p7.d, p0/z, z1.d, z4.d ; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: mov z4.d, p1/m, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z5.d ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z5.d, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z4.d ; CHECK-NEXT: mov z6.d, p2/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p2.d, p0/z, z3.d, z3.d ; CHECK-NEXT: mov z7.d, p3/m, #-32768 // =0xffffffffffff8000 @@ -467,7 +467,7 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: mov z24.d, p4/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p4.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z2.d, p5, z25.d, z5.d +; CHECK-NEXT: sel z2.d, p5, z25.d, z4.d ; CHECK-NEXT: sel z0.d, p6, z25.d, z6.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: sel z1.d, p7, z25.d, z7.d @@ -492,8 +492,8 @@ define @test_signed_v8f64_v8i16( %f) { define @test_signed_v2f64_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f64_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff @@ -521,29 +521,29 @@ define @test_signed_v4f64_v4i64( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.d +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z1.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z4.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z4.d +; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d +; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z3.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z3.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, 
p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z0.d, z0.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z3.d, p2, z3.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -568,8 +568,8 @@ declare @llvm.fptosi.sat.nxv4f16.nxv4i64() define @test_signed_v2f16_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff @@ -592,8 +592,8 @@ define @test_signed_v2f16_v2i32( %f) { define @test_signed_v4f16_v4i32( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0x80000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff @@ -621,31 +621,31 @@ define @test_signed_v8f16_v8i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.s, #0x80000000 -; CHECK-NEXT: mov z4.h, w8 +; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: mov z6.s, #0x7fffffff ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.s, p0/m, z1.h +; CHECK-NEXT: mov z2.s, #0x80000000 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.s, p0/m, z1.h ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.s, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z4.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z4.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.s, p1/m, z3.s +; CHECK-NEXT: sel z3.s, p1, z2.s, z4.s ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z3.s, p2, z3.s, z5.s -; CHECK-NEXT: sel z0.s, p3, z6.s, z2.s -; CHECK-NEXT: sel z1.s, p4, z6.s, z3.s +; CHECK-NEXT: sel z2.s, p2, z2.s, z5.s +; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s +; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 @@ -659,8 +659,8 @@ define @test_signed_v8f16_v8i32( %f) { define @test_signed_v4f16_v4i16( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #63488 // =0xf800 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff ; CHECK-NEXT: mov z2.h, w8 @@ -682,8 +682,8 @@ define @test_signed_v4f16_v4i16( %f) { define @test_signed_v8f16_v8i16( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #63488 
// =0xf800 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff ; CHECK-NEXT: mov z2.h, w8 @@ -705,8 +705,8 @@ define @test_signed_v8f16_v8i16( %f) { define @test_signed_v2f16_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff @@ -734,31 +734,31 @@ define @test_signed_v4f16_v4i64( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.d, #0x8000000000000000 -; CHECK-NEXT: mov z4.h, w8 +; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.h +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.h ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z4.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z4.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z3.d, p2, z3.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 diff --git a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll index c56c0b37888dc0..ed352ffec339f1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll @@ -82,8 +82,8 @@ define @test_signed_v4f32_v4i16( %f) { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #65280 // =0xff00 ; CHECK-NEXT: movk w8, #18303, lsl #16 -; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: fcvtzu z2.s, p0/m, z0.s ; CHECK-NEXT: not p1.b, p0/z, p1.b @@ -102,9 +102,9 @@ define @test_signed_v8f32_v8i16( %f) { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #65280 // =0xff00 ; CHECK-NEXT: movk w8, #18303, lsl #16 -; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.s, p0/m, z1.s ; CHECK-NEXT: movprfx z4, z0 @@ -146,10 +146,10 @@ define @test_signed_v2f32_v2i64( %f) { define @test_signed_v4f32_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z3.d, z0.s ; CHECK-NEXT: 
mov w8, #1602224127 // =0x5f7fffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: fcmge p2.s, p0/z, z3.s, #0.0 @@ -186,8 +186,8 @@ define @test_signed_v2f64_v2i32( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 ; CHECK-NEXT: movk x8, #16879, lsl #48 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.d ; CHECK-NEXT: not p1.b, p0/z, p1.b @@ -206,9 +206,9 @@ define @test_signed_v4f64_v4i32( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 ; CHECK-NEXT: movk x8, #16879, lsl #48 -; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d ; CHECK-NEXT: movprfx z4, z0 @@ -241,28 +241,28 @@ define @test_signed_v8f64_v8i32( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 ; CHECK-NEXT: movk x8, #16879, lsl #48 -; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, #0.0 ; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, #0.0 ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: fcvtzu z5.d, p0/m, z1.d +; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: movprfx z6, z0 ; CHECK-NEXT: fcvtzu z6.d, p0/m, z0.d ; CHECK-NEXT: movprfx z7, z3 ; CHECK-NEXT: fcvtzu z7.d, p0/m, z3.d ; CHECK-NEXT: movprfx z24, z2 ; CHECK-NEXT: fcvtzu z24.d, p0/m, z2.d +; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z4.d ; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z4.d -; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: mov z0.d, #0xffffffff ; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: mov z5.d, p1/m, #0 // =0x0 ; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z4.d +; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: fcmgt p0.d, p0/z, z2.d, z4.d ; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0 @@ -289,9 +289,9 @@ define @test_signed_v4f64_v4i16( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000 ; CHECK-NEXT: movk x8, #16623, lsl #48 -; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d ; CHECK-NEXT: movprfx z4, z0 @@ -324,28 +324,28 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000 ; CHECK-NEXT: movk x8, #16623, lsl #48 -; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, #0.0 ; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: movprfx z5, z3 ; CHECK-NEXT: fcvtzu z5.d, p0/m, z3.d +; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: movprfx z6, z2 ; CHECK-NEXT: fcvtzu z6.d, p0/m, z2.d ; CHECK-NEXT: movprfx z7, z1 ; CHECK-NEXT: fcvtzu z7.d, p0/m, z1.d ; CHECK-NEXT: movprfx z24, z0 ; CHECK-NEXT: fcvtzu z24.d, p0/m, z0.d +; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z4.d ; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z4.d -; CHECK-NEXT: mov z2.d, #65535 // =0xffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: 
not p2.b, p0/z, p2.b +; CHECK-NEXT: mov z2.d, #65535 // =0xffff ; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: mov z5.d, p1/m, #0 // =0x0 ; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d +; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d ; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0 @@ -465,10 +465,10 @@ define @test_signed_v4f16_v4i32( %f) { define @test_signed_v8f16_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, #0.0 ; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0 @@ -549,10 +549,10 @@ define @test_signed_v2f16_v2i64( %f) { define @test_signed_v4f16_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z3.d, z0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, #0.0 ; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0 diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll index 5c4c9463528b87..ad6371f78ec088 100644 --- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll +++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll @@ -71,12 +71,12 @@ define @gather_i8_index_offset_8(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_f16_index_offset_var: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z2.d, x1 -; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: uunpklo z3.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: movprfx z4, z2 ; CHECK-NEXT: mla z4.d, p1/m, z1.d, z2.d @@ -101,16 +101,16 @@ define void @scatter_f16_index_offset_var(ptr %base, i64 %offset, i64 %scale, %pg, %data) #0 { ; CHECK-LABEL: scatter_i8_index_offset_maximum_plus_one: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov w8, #33554432 // =0x2000000 ; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: index z1.d, #0, x8 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: index z1.d, #0, x8 +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: lsr x9, x9, #4 +; CHECK-NEXT: add x8, x0, x1 ; CHECK-NEXT: mov w10, #67108864 // =0x4000000 -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1b { z2.d }, p1, [x8, z1.d] ; CHECK-NEXT: madd x8, x9, x10, x8 ; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] @@ -131,17 +131,17 @@ define void @scatter_i8_index_offset_maximum_plus_one(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_i8_index_offset_minimum_minus_one: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov x8, #-33554433 // =0xfffffffffdffffff ; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: index z1.d, #0, x8 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: mov x10, #-2 // =0xfffffffffffffffe +; CHECK-NEXT: index z1.d, #0, x8 +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: lsr x9, x9, #4 +; CHECK-NEXT: mov x10, #-2 // =0xfffffffffffffffe ; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: movk x10, #64511, lsl #16 ; CHECK-NEXT: punpkhi p0.h, 
p0.b -; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: st1b { z2.d }, p1, [x8, z1.d] ; CHECK-NEXT: madd x8, x9, x10, x8 ; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] @@ -162,16 +162,16 @@ define void @scatter_i8_index_offset_minimum_minus_one(ptr %base, i64 %offset, < define void @scatter_i8_index_stride_too_big(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_i8_index_stride_too_big: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov x8, #4611686018427387904 // =0x4000000000000000 ; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: index z1.d, #0, x8 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: index z1.d, #0, x8 +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: lsr x9, x9, #4 +; CHECK-NEXT: add x8, x0, x1 ; CHECK-NEXT: mov x10, #-9223372036854775808 // =0x8000000000000000 -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1b { z2.d }, p1, [x8, z1.d] ; CHECK-NEXT: madd x8, x9, x10, x8 ; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] diff --git a/llvm/test/CodeGen/AArch64/sve-hadd.ll b/llvm/test/CodeGen/AArch64/sve-hadd.ll index c73370d50287bf..f90aef8daa5dc4 100644 --- a/llvm/test/CodeGen/AArch64/sve-hadd.ll +++ b/llvm/test/CodeGen/AArch64/sve-hadd.ll @@ -129,9 +129,9 @@ define @haddu_v2i32( %s0, @haddu_v2i16( %s0, @haddu_v4i16( %s0, @haddu_v4i8( %s0, %s ; ; SVE2-LABEL: haddu_v4i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: and z0.s, z0.s, #0xff ; SVE2-NEXT: and z1.s, z1.s, #0xff +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: uhadd z0.s, p0/m, z0.s, z1.s ; SVE2-NEXT: ret entry: @@ -557,9 +557,9 @@ define @haddu_v8i8( %s0, %s ; ; SVE2-LABEL: haddu_v8i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: and z0.h, z0.h, #0xff ; SVE2-NEXT: and z1.h, z1.h, #0xff +; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: uhadd z0.h, p0/m, z0.h, z1.h ; SVE2-NEXT: ret entry: @@ -787,9 +787,9 @@ define @rhaddu_v2i32( %s0, @rhaddu_v2i16( %s0, @rhaddu_v4i16( %s0, @rhaddu_v4i8( %s0, % ; ; SVE2-LABEL: rhaddu_v4i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: and z0.s, z0.s, #0xff ; SVE2-NEXT: and z1.s, z1.s, #0xff +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: urhadd z0.s, p0/m, z0.s, z1.s ; SVE2-NEXT: ret entry: @@ -1241,9 +1241,9 @@ define @rhaddu_v8i8( %s0, % ; ; SVE2-LABEL: rhaddu_v8i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: and z0.h, z0.h, #0xff ; SVE2-NEXT: and z1.h, z1.h, #0xff +; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: urhadd z0.h, p0/m, z0.h, z1.h ; SVE2-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll index e20399de70bf83..73bbee094827e7 100644 --- a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll +++ b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll @@ -175,14 +175,14 @@ define @uminv_zero_fill( %pg, @zero_fill_non_zero_index( %pg, %a) #0 { ; CHECK-LABEL: zero_fill_non_zero_index: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: uminv d3, p0, z0.d ; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fmov x8, d3 ; CHECK-NEXT: cmpeq p0.d, p1/z, z1.d, z2.d +; CHECK-NEXT: fmov x8, d3 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %t1 = call i64 @llvm.aarch64.sve.uminv.nxv2i64( %pg, %a) @@ -210,11 +210,11 @@ define @zero_fill_type_mismatch( %pg, @zero_fill_no_zero_upper_lanes( %pg, %a) #0 { ; CHECK-LABEL: 
zero_fill_no_zero_upper_lanes: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d, vl1 ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z0.d ; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: mov z1.d, p1/m, x8 +; CHECK-NEXT: mov z1.d, p0/m, x8 ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret %t1 = call @llvm.aarch64.sve.umin.nxv2i64( %pg, %a, %a) diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll index 2aa298f6d9173f..7344964f13bbad 100644 --- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll @@ -48,8 +48,8 @@ define @test_lane0_2xi64( %a) { define @test_lane0_2xf64( %a) { ; CHECK-LABEL: test_lane0_2xf64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: fmov d1, #1.00000000 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, z1.d ; CHECK-NEXT: ret %b = insertelement %a, double 1.0, i32 0 @@ -59,8 +59,8 @@ define @test_lane0_2xf64( %a) { define @test_lane0_4xf32( %a) { ; CHECK-LABEL: test_lane0_4xf32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: fmov s1, #1.00000000 +; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret %b = insertelement %a, float 1.0, i32 0 @@ -70,8 +70,8 @@ define @test_lane0_4xf32( %a) { define @test_lane0_8xf16( %a) { ; CHECK-LABEL: test_lane0_8xf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: fmov h1, #1.00000000 +; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret %b = insertelement %a, half 1.0, i32 0 @@ -93,9 +93,9 @@ define @test_lane0_8xbf16( %a, bfloat define @test_lane4_2xi64( %a) { ; CHECK-LABEL: test_lane4_2xi64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #4 // =0x4 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov w8, #30 // =0x1e ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d @@ -109,9 +109,9 @@ define @test_lane4_2xi64( %a) { define @test_lane9_8xf16( %a) { ; CHECK-LABEL: test_lane9_8xf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #9 // =0x9 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fmov h1, #1.00000000 @@ -124,9 +124,9 @@ define @test_lane9_8xf16( %a) { define @test_lane9_8xbf16( %a, bfloat %x) { ; CHECK-LABEL: test_lane9_8xbf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #9 // =0x9 ; CHECK-NEXT: index z2.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h ; CHECK-NEXT: mov z0.h, p0/m, h1 @@ -138,9 +138,9 @@ define @test_lane9_8xbf16( %a, bfloat define @test_lane1_16xi8( %a) { ; CHECK-LABEL: test_lane1_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: mov w8, #30 // =0x1e ; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b @@ -153,9 +153,9 @@ define @test_lane1_16xi8( %a) { define @test_lanex_16xi8( %a, i32 %x) { ; CHECK-LABEL: test_lanex_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: mov w8, #30 // =0x1e ; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b @@ -179,9 +179,9 @@ define @extract_insert_4xi32( %a) { define @test_lane6_undef_8xi16(i16 
 define @test_lane6_undef_8xi16(i16 %a) {
 ; CHECK-LABEL: test_lane6_undef_8xi16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov w8, #6 // =0x6
 ; CHECK-NEXT: index z0.h, #0, #1
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z1.h, w8
 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT: mov z0.h, p0/m, w0
@@ -202,8 +202,8 @@ define @test_lane0_undef_16xi8(i8 %a) {
 define @test_insert0_of_extract0_16xi8( %a, %b) {
 ; CHECK-LABEL: test_insert0_of_extract0_16xi8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl1
 ; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: ptrue p0.b, vl1
 ; CHECK-NEXT: mov z0.b, p0/m, w8
 ; CHECK-NEXT: ret
 %c = extractelement %b, i32 0
@@ -215,12 +215,12 @@ define @test_insert64_of_extract64_16xi8( %
 ; CHECK-LABEL: test_insert64_of_extract64_16xi8:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #64 // =0x40
-; CHECK-NEXT: ptrue p1.b
 ; CHECK-NEXT: whilels p0.b, xzr, x8
 ; CHECK-NEXT: mov z2.b, w8
 ; CHECK-NEXT: lastb w9, p0, z1.b
 ; CHECK-NEXT: index z1.b, #0, #1
-; CHECK-NEXT: cmpeq p0.b, p1/z, z1.b, z2.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b
 ; CHECK-NEXT: mov z0.b, p0/m, w9
 ; CHECK-NEXT: ret
 %c = extractelement %b, i32 64
@@ -231,9 +231,9 @@ define @test_insert64_of_extract64_16xi8( %
 define @test_insert3_of_extract1_16xi8( %a, %b) {
 ; CHECK-LABEL: test_insert3_of_extract1_16xi8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov w8, #3 // =0x3
 ; CHECK-NEXT: index z2.b, #0, #1
+; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov z3.b, w8
 ; CHECK-NEXT: umov w8, v1.b[1]
 ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b
@@ -329,9 +329,9 @@ define @test_insert_into_undef_nxv2f64(double %a) {
 define @test_insert_with_index_nxv2f16(half %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: index z1.d, #0, #1
 ; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT: mov z0.h, p0/m, h0
 ; CHECK-NEXT: ret
@@ -342,9 +342,9 @@ define @test_insert_with_index_nxv2f16(half %h, i64 %idx) {
 define @test_insert_with_index_nxv4f16(half %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv4f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: index z1.s, #0, #1
 ; CHECK-NEXT: mov z2.s, w0
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT: mov z0.h, p0/m, h0
 ; CHECK-NEXT: ret
@@ -355,9 +355,9 @@ define @test_insert_with_index_nxv4f16(half %h, i64 %idx) {
 define @test_insert_with_index_nxv8f16(half %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv8f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: index z1.h, #0, #1
 ; CHECK-NEXT: mov z2.h, w0
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h
 ; CHECK-NEXT: mov z0.h, p0/m, h0
 ; CHECK-NEXT: ret
@@ -368,9 +368,9 @@ define @test_insert_with_index_nxv8f16(half %h, i64 %idx) {
 define @test_insert_with_index_nxv2bf16(bfloat %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv2bf16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: index z1.d, #0, #1
 ; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT: mov z0.h, p0/m, h0
 ; CHECK-NEXT: ret
@@ -381,9 +381,9 @@ define @test_insert_with_index_nxv2bf16(bfloat %h, i64 %id
 define @test_insert_with_index_nxv4bf16(bfloat %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv4bf16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: index z1.s, #0, #1
 ; CHECK-NEXT: mov z2.s, w0
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT: mov z0.h, p0/m, h0
 ; CHECK-NEXT: ret
@@ -394,9 +394,9 @@ define @test_insert_with_index_nxv4bf16(bfloat %h, i64 %id
 define @test_insert_with_index_nxv8bf16(bfloat %h, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv8bf16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: index z1.h, #0, #1
 ; CHECK-NEXT: mov z2.h, w0
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h
 ; CHECK-NEXT: mov z0.h, p0/m, h0
 ; CHECK-NEXT: ret
@@ -407,9 +407,9 @@ define @test_insert_with_index_nxv8bf16(bfloat %h, i64 %id
 define @test_insert_with_index_nxv2f32(float %f, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv2f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: index z1.d, #0, #1
 ; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT: mov z0.s, p0/m, s0
 ; CHECK-NEXT: ret
@@ -420,9 +420,9 @@ define @test_insert_with_index_nxv2f32(float %f, i64 %idx)
 define @test_insert_with_index_nxv4f32(float %f, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv4f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: index z1.s, #0, #1
 ; CHECK-NEXT: mov z2.s, w0
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s
 ; CHECK-NEXT: mov z0.s, p0/m, s0
 ; CHECK-NEXT: ret
@@ -433,9 +433,9 @@ define @test_insert_with_index_nxv4f32(float %f, i64 %idx)
 define @test_insert_with_index_nxv2f64(double %d, i64 %idx) {
 ; CHECK-LABEL: test_insert_with_index_nxv2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: index z1.d, #0, #1
 ; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d
 ; CHECK-NEXT: mov z0.d, p0/m, d0
 ; CHECK-NEXT: ret
@@ -447,11 +447,11 @@ define @test_insert_with_index_nxv2f64(double %d, i64 %idx
 define @test_predicate_insert_2xi1_immediate ( %val, i1 %elt) {
 ; CHECK-LABEL: test_predicate_insert_2xi1_immediate:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d, vl1
 ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d, vl1
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: mov z0.d, p0/m, x0
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: mov z0.d, p1/m, x0
 ; CHECK-NEXT: and z0.d, z0.d, #0x1
 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT: ret
@@ -462,9 +462,9 @@ define @test_predicate_insert_2xi1_immediate (
 @test_predicate_insert_4xi1_immediate ( %val, i1 %elt) {
 ; CHECK-LABEL: test_predicate_insert_4xi1_immediate:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.s
 ; CHECK-NEXT: mov w8, #2 // =0x2
 ; CHECK-NEXT: index z0.s, #0, #1
+; CHECK-NEXT: ptrue p1.s
 ; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: cmpeq p2.s, p1/z, z0.s, z1.s
 ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
@@ -479,9 +479,9 @@ define @test_predicate_insert_4xi1_immediate (
 @test_predicate_insert_8xi1_immediate ( %val, i32 %idx) {
 ; CHECK-LABEL: test_predicate_insert_8xi1_immediate:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: mov w8, w0
 ; CHECK-NEXT: index z0.h, #0, #1
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: ptrue p1.h
 ; CHECK-NEXT: mov z1.h, w8
 ; CHECK-NEXT: mov w8, #1 // =0x1
 ; CHECK-NEXT: cmpeq p2.h, p1/z, z0.h, z1.h
@@ -497,9 +497,9 @@ define @test_predicate_insert_8xi1_immediate (
 @test_predicate_insert_16xi1_immediate ( %val) {
 ; CHECK-LABEL: test_predicate_insert_16xi1_immediate:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.b
 ; CHECK-NEXT: mov w8, #4 // =0x4
 ; CHECK-NEXT: index z0.b, #0, #1
+; CHECK-NEXT: ptrue p1.b
 ; CHECK-NEXT: mov z1.b, w8
 ; CHECK-NEXT: mov w8, wzr
 ; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z1.b
@@ -516,9 +516,9 @@ define @test_predicate_insert_16xi1_immediate (
 @test_predicate_insert_2xi1( %val, i1 %elt, i32 %idx) {
 ; CHECK-LABEL: test_predicate_insert_2xi1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: mov w8, w1
 ; CHECK-NEXT: index z0.d, #0, #1
+; CHECK-NEXT: mov w8, w1
+; CHECK-NEXT: ptrue p1.d
 ; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: cmpeq p2.d, p1/z, z0.d, z1.d
@@ -534,9 +534,9 @@ define @test_predicate_insert_2xi1( %val, i1
 define @test_predicate_insert_4xi1( %val, i1 %elt, i32 %idx) {
 ; CHECK-LABEL: test_predicate_insert_4xi1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: mov w8, w1
 ; CHECK-NEXT: index z0.s, #0, #1
+; CHECK-NEXT: mov w8, w1
+; CHECK-NEXT: ptrue p1.s
 ; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: cmpeq p2.s, p1/z, z0.s, z1.s
 ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1
@@ -550,9 +550,9 @@ define @test_predicate_insert_4xi1( %val, i1
 define @test_predicate_insert_8xi1( %val, i1 %elt, i32 %idx) {
 ; CHECK-LABEL: test_predicate_insert_8xi1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: mov w8, w1
 ; CHECK-NEXT: index z0.h, #0, #1
+; CHECK-NEXT: mov w8, w1
+; CHECK-NEXT: ptrue p1.h
 ; CHECK-NEXT: mov z1.h, w8
 ; CHECK-NEXT: cmpeq p2.h, p1/z, z0.h, z1.h
 ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1
@@ -567,9 +567,9 @@ define @test_predicate_insert_8xi1( %val, i1
 define @test_predicate_insert_16xi1( %val, i1 %elt, i32 %idx) {
 ; CHECK-LABEL: test_predicate_insert_16xi1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: mov w8, w1
 ; CHECK-NEXT: index z0.b, #0, #1
+; CHECK-NEXT: mov w8, w1
+; CHECK-NEXT: ptrue p1.b
 ; CHECK-NEXT: mov z1.b, w8
 ; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z1.b
 ; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
@@ -589,24 +589,24 @@ define @test_predicate_insert_32xi1( %val,
 ; CHECK-NEXT: .cfi_offset w29, -16
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: ptrue p2.b
 ; CHECK-NEXT: rdvl x8, #2
 ; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1
 ; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1
 ; CHECK-NEXT: sub x8, x8, #1
 ; CHECK-NEXT: mov w9, w1
+; CHECK-NEXT: ptrue p1.b
 ; CHECK-NEXT: cmp x9, x8
 ; CHECK-NEXT: csel x8, x9, x8, lo
 ; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: st1b { z0.b }, p2, [sp, #1, mul vl]
-; CHECK-NEXT: st1b { z1.b }, p2, [sp]
+; CHECK-NEXT: st1b { z0.b }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [sp]
 ; CHECK-NEXT: strb w0, [x9, x8]
-; CHECK-NEXT: ld1b { z0.b }, p2/z, [sp]
-; CHECK-NEXT: ld1b { z1.b }, p2/z, [sp, #1, mul vl]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [sp]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [sp, #1, mul vl]
 ; CHECK-NEXT: and z0.b, z0.b, #0x1
 ; CHECK-NEXT: and z1.b, z1.b, #0x1
-; CHECK-NEXT: cmpne p0.b, p2/z, z0.b, #0
-; CHECK-NEXT: cmpne p1.b, p2/z, z1.b, #0
+; CHECK-NEXT: cmpne p0.b, p1/z, z0.b, #0
+; CHECK-NEXT: cmpne p1.b, p1/z, z1.b, #0
 ; CHECK-NEXT: addvl sp, sp, #2
 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
index 4a5e272582d8ec..5efe9e2819d5e8 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -17,15 +17,15 @@ define @insert_v2i64_nxv2i64_idx2( %vec, <2
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: cntd x8
 ; CHECK-NEXT: mov w9, #2 // =0x2
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: sub x8, x8, #2
 ; CHECK-NEXT: cmp x8, #2
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: mov x9, sp
 ; CHECK-NEXT: lsl x8, x8, #3
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: str q1, [x9, x8]
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT: addvl sp, sp, #1
@@ -51,15 +51,15 @@ define @insert_v4i32_nxv4i32_idx4( %vec, <4
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: cntw x8
 ; CHECK-NEXT: mov w9, #4 // =0x4
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: sub x8, x8, #4
 ; CHECK-NEXT: cmp x8, #4
+; CHECK-NEXT: st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: mov x9, sp
 ; CHECK-NEXT: lsl x8, x8, #2
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
 ; CHECK-NEXT: str q1, [x9, x8]
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
 ; CHECK-NEXT: addvl sp, sp, #1
@@ -85,15 +85,15 @@ define @insert_v8i16_nxv8i16_idx8( %vec, <8
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: cnth x8
 ; CHECK-NEXT: mov w9, #8 // =0x8
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: sub x8, x8, #8
 ; CHECK-NEXT: cmp x8, #8
+; CHECK-NEXT: st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: mov x9, sp
 ; CHECK-NEXT: lsl x8, x8, #1
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
 ; CHECK-NEXT: str q1, [x9, x8]
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
 ; CHECK-NEXT: addvl sp, sp, #1
@@ -119,15 +119,15 @@ define @insert_v16i8_nxv16i8_idx16( %vec, <
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov w9, #16 // =0x10
 ; CHECK-NEXT: sub x8, x8, #16
-; CHECK-NEXT: mov x10, sp
 ; CHECK-NEXT: cmp x8, #16
-; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: st1b { z0.b }, p0, [sp]
-; CHECK-NEXT: str q1, [x10, x8]
+; CHECK-NEXT: csel x8, x8, x9, lo
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str q1, [x9, x8]
 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp]
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -239,8 +239,8 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) uwtable {
 ; CHECK-NEXT: .cfi_offset w29, -16
 ; CHECK-NEXT: addvl sp, sp, #-2
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: str q0, [sp, #16]
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl]
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [sp]
diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll
index c0ddceb42e1d0a..52bd79e7a7e60d 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll
@@ -55,8 +55,8 @@ define @smax_i16_neg( %a) {
 define @smax_i16_out_of_range( %a) {
 ; CHECK-LABEL: smax_i16_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: dupm z1.b, #0x1
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i16 257, i32 0
@@ -93,8 +93,8 @@ define @smax_i32_neg( %a) {
 define @smax_i32_out_of_range( %a) {
 ; CHECK-LABEL: smax_i32_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z1.s, #-129 // =0xffffffffffffff7f
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i32 -129, i32 0
@@ -131,8 +131,8 @@ define @smax_i64_neg( %a) {
 define @smax_i64_out_of_range( %a) {
 ; CHECK-LABEL: smax_i64_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, #65535 // =0xffff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i64 65535, i32 0
@@ -196,8 +196,8 @@ define @smin_i16_neg( %a) {
 define @smin_i16_out_of_range( %a) {
 ; CHECK-LABEL: smin_i16_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: dupm z1.b, #0x1
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i16 257, i32 0
@@ -234,8 +234,8 @@ define @smin_i32_neg( %a) {
 define @smin_i32_out_of_range( %a) {
 ; CHECK-LABEL: smin_i32_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z1.s, #-129 // =0xffffffffffffff7f
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i32 -129, i32 0
@@ -272,8 +272,8 @@ define @smin_i64_neg( %a) {
 define @smin_i64_out_of_range( %a) {
 ; CHECK-LABEL: smin_i64_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, #65535 // =0xffff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i64 65535, i32 0
@@ -325,8 +325,8 @@ define @umax_i16_pos( %a) {
 define @umax_i16_out_of_range( %a) {
 ; CHECK-LABEL: umax_i16_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: dupm z1.b, #0x1
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i16 257, i32 0
@@ -351,8 +351,8 @@ define @umax_i32_pos( %a) {
 define @umax_i32_out_of_range( %a) {
 ; CHECK-LABEL: umax_i32_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov w8, #257 // =0x101
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
@@ -378,8 +378,8 @@ define @umax_i64_pos( %a) {
 define @umax_i64_out_of_range( %a) {
 ; CHECK-LABEL: umax_i64_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, #65535 // =0xffff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i64 65535, i32 0
@@ -431,8 +431,8 @@ define @umin_i16_pos( %a) {
 define @umin_i16_out_of_range( %a) {
 ; CHECK-LABEL: umin_i16_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: dupm z1.b, #0x1
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i16 257, i32 0
@@ -457,8 +457,8 @@ define @umin_i32_pos( %a) {
 define @umin_i32_out_of_range( %a) {
 ; CHECK-LABEL: umin_i32_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov w8, #257 // =0x101
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
@@ -484,8 +484,8 @@ define @umin_i64_pos( %a) {
 define @umin_i64_out_of_range( %a) {
 ; CHECK-LABEL: umin_i64_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, #65535 // =0xffff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i64 65535, i32 0
@@ -589,8 +589,8 @@ define @mul_i64_pos( %a) {
 define @mul_i16_range( %a) {
 ; CHECK-LABEL: mul_i16_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z1.h, #255 // =0xff
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i16 255, i32 0
@@ -602,8 +602,8 @@ define @mul_i16_range( %a) {
 define @mul_i32_range( %a) {
 ; CHECK-LABEL: mul_i32_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z1.s, #255 // =0xff
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i32 255, i32 0
@@ -615,8 +615,8 @@ define @mul_i32_range( %a) {
 define @mul_i64_range( %a) {
 ; CHECK-LABEL: mul_i64_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, #255 // =0xff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 %elt = insertelement undef, i64 255, i32 0
@@ -766,8 +766,8 @@ define @lsr_i64( %a){
 define @sdiv_const( %a) #0 {
 ; CHECK-LABEL: sdiv_const:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z1.s, #3 // =0x3
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -778,8 +778,8 @@ entry:
 define @udiv_const( %a) #0 {
 ; CHECK-LABEL: udiv_const:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z1.s, #3 // =0x3
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
index fc2672f8c80a8c..5f92dee3b53057 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll
@@ -532,8 +532,8 @@ define @mls_i64( %a, %b,
 define @muladd_i64_positiveAddend( %a, %b)
 ; CHECK-LABEL: muladd_i64_positiveAddend:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z2.d, #0xffffffff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT: ret
 {
@@ -545,8 +545,8 @@ define @mls_i64( %a, %b,
 define @muladd_i64_negativeAddend( %a, %b)
 ; CHECK-LABEL: muladd_i64_negativeAddend:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z2.d, #0xffffffff00000001
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT: ret
 {
@@ -559,8 +559,8 @@ define @muladd_i64_negativeAddend( %a,
 @muladd_i32_positiveAddend( %a, %b)
 ; CHECK-LABEL: muladd_i32_positiveAddend:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z2.s, #0x10000
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT: ret
 {
@@ -572,8 +572,8 @@ define @muladd_i32_positiveAddend( %a,
 @muladd_i32_negativeAddend( %a, %b)
 ; CHECK-LABEL: muladd_i32_negativeAddend:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z2.s, #0xffff0000
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT: ret
 {
@@ -585,8 +585,8 @@ define @muladd_i32_negativeAddend( %a,
 @muladd_i16_positiveAddend( %a, %b)
 ; CHECK-LABEL: muladd_i16_positiveAddend:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z2.h, #255 // =0xff
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT: ret
 {
@@ -598,8 +598,8 @@ define @muladd_i16_positiveAddend( %a,
 @muladd_i16_negativeAddend( %a, %b)
 ; CHECK-LABEL: muladd_i16_negativeAddend:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z2.h, #-255 // =0xffffffffffffff01
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mad z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT: ret
 {
@@ -611,8 +611,8 @@ define @muladd_i16_negativeAddend( %a,
 @muladd_i8_positiveAddend( %a, %b)
 ; CHECK-LABEL: muladd_i8_positiveAddend:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov z2.b, #15 // =0xf
+; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mad z0.b, p0/m, z1.b, z2.b
 ; CHECK-NEXT: ret
 {
@@ -624,8 +624,8 @@ define @muladd_i8_positiveAddend( %a,
 @muladd_i8_negativeAddend( %a, %b)
 ; CHECK-LABEL: muladd_i8_negativeAddend:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov z2.b, #-15 // =0xfffffffffffffff1
+; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mad z0.b, p0/m, z1.b, z2.b
 ; CHECK-NEXT: ret
 {
@@ -748,8 +748,8 @@ define @mulsub_i8_negativeAddend( %a,
 @multiple_fused_ops( %a, %b)
 ; CHECK-LABEL: multiple_fused_ops:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov w8, #200 // =0xc8
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z2.h, w8
 ; CHECK-NEXT: mla z2.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
@@ -770,8 +770,8 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
 ; CHECK-NEXT: b.lt .LBB70_3
 ; CHECK-NEXT: // %bb.1: // %for.body.preheader
 ; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z0.s, #1 // =0x1
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: whilelo p1.s, xzr, x9
 ; CHECK-NEXT: mov x8, xzr
 ; CHECK-NEXT: cntw x10
diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
index d04da62451778a..8c1b5225b7f257 100644
--- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -378,29 +378,29 @@ declare i8 @llvm.vector.reduce.smin.nxv10i8()
 define i8 @smin_nxv10i8( %a) {
 ; CHECK-LABEL: smin_nxv10i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: uunpkhi z2.h, z0.b
-; CHECK-NEXT: mov z1.d, #127 // =0x7f
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: mov z3.d, #127 // =0x7f
 ; CHECK-NEXT: uunpklo z0.h, z0.b
 ; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEXT: uunpklo z3.d, z3.s
-; CHECK-NEXT: uzp1 z3.s, z3.s, z1.s
-; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h
-; CHECK-NEXT: uzp1 z2.b, z0.b, z2.b
-; CHECK-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NEXT: uunpkhi z3.s, z2.h
-; CHECK-NEXT: uunpklo z2.s, z2.h
-; CHECK-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEXT: uzp1 z3.s, z1.s, z3.s
-; CHECK-NEXT: uzp1 z2.h, z2.h, z3.h
-; CHECK-NEXT: uzp1 z2.b, z0.b, z2.b
-; CHECK-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NEXT: uunpkhi z3.s, z2.h
-; CHECK-NEXT: uunpklo z2.s, z2.h
-; CHECK-NEXT: uunpklo z3.d, z3.s
-; CHECK-NEXT: uzp1 z1.s, z3.s, z1.s
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s
 ; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z3.s, z2.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
 ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
 ; CHECK-NEXT: sminv b0, p0, z0.b
 ; CHECK-NEXT: fmov w0, s0
@@ -414,12 +414,12 @@ declare i8 @llvm.vector.reduce.add.nxv12i8()
 define i8 @uaddv_nxv12i8( %a) {
 ; CHECK-LABEL: uaddv_nxv12i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: uunpkhi z2.h, z0.b
-; CHECK-NEXT: mov z1.s, #0 // =0x0
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: mov z2.s, #0 // =0x0
 ; CHECK-NEXT: uunpklo z0.h, z0.b
 ; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: uunpklo z2.s, z2.h
-; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
 ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
 ; CHECK-NEXT: uaddv d0, p0, z0.b
 ; CHECK-NEXT: fmov x0, d0
@@ -434,15 +434,15 @@ declare i8 @llvm.vector.reduce.umax.nxv14i8()
 define i8 @umax_nxv14i8( %a) {
 ; CHECK-LABEL: umax_nxv14i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: uunpkhi z2.h, z0.b
-; CHECK-NEXT: mov z1.d, #0 // =0x0
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: mov z3.d, #0 // =0x0
 ; CHECK-NEXT: uunpklo z0.h, z0.b
 ; CHECK-NEXT: ptrue p0.b
-; CHECK-NEXT: uunpkhi z3.s, z2.h
-; CHECK-NEXT: uunpklo z2.s, z2.h
-; CHECK-NEXT: uunpklo z3.d, z3.s
-; CHECK-NEXT: uzp1 z1.s, z3.s, z1.s
-; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
 ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
 ; CHECK-NEXT: umaxv b0, p0, z0.b
 ; CHECK-NEXT: fmov w0, s0
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
index 2464eacd185dd4..bc94c087ef5fe7 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll
@@ -239,10 +239,10 @@ define @index_rr_i32_combine(i32 %a, i32 %b) {
 define @index_rr_i32_not_combine(i32 %a, i32 %b) {
 ; CHECK-LABEL: index_rr_i32_not_combine:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: index z0.s, #0, #1
 ; CHECK-NEXT: mov z1.s, w0
 ; CHECK-NEXT: mov z2.s, w1
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mla z1.s, p0/m, z0.s, z2.s
 ; CHECK-NEXT: add z0.s, z1.s, z0.s
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
index 3e453a6b781794..5648e8244e6ec1 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll
@@ -247,8 +247,8 @@ define @sub_i32_ptrue_all_h( %a) #0 {
 define @sub_i32_ptrue_all_d( %a) #0 {
 ; CHECK-LABEL: sub_i32_ptrue_all_d:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.s, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: sub z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -402,8 +402,8 @@ define @subr_i32_ptrue_all_h( %a) #0 {
 define @subr_i32_ptrue_all_d( %a) #0 {
 ; CHECK-LABEL: subr_i32_ptrue_all_d:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.s, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: subr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -449,8 +449,8 @@ define @smax_i16( %a) {
 define @smax_i16_out_of_range( %a) {
 ; CHECK-LABEL: smax_i16_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov w8, #129 // =0x81
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z1.h, w8
 ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
@@ -480,8 +480,8 @@ define @smax_i32( %a) {
 define @smax_i32_out_of_range( %a) {
 ; CHECK-LABEL: smax_i32_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z1.s, #-129 // =0xffffffffffffff7f
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
@@ -510,8 +510,8 @@ define @smax_i64( %a) {
 define @smax_i64_out_of_range( %a) {
 ; CHECK-LABEL: smax_i64_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, #65535 // =0xffff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -559,8 +559,8 @@ define @smax_i32_ptrue_all_h( %a) #0 {
 define @smax_i32_ptrue_all_d( %a) #0 {
 ; CHECK-LABEL: smax_i32_ptrue_all_d:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.s, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -606,8 +606,8 @@ define @smin_i16( %a) {
 define @smin_i16_out_of_range( %a) {
 ; CHECK-LABEL: smin_i16_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z1.h, #-129 // =0xffffffffffffff7f
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
 %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
@@ -636,8 +636,8 @@ define @smin_i32( %a) {
 define @smin_i32_out_of_range( %a) {
 ; CHECK-LABEL: smin_i32_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov w8, #257 // =0x101
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
@@ -668,8 +668,8 @@ define @smin_i64( %a) {
 define @smin_i64_out_of_range( %a) {
 ; CHECK-LABEL: smin_i64_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, #-256 // =0xffffffffffffff00
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -717,8 +717,8 @@ define @smin_i32_ptrue_all_h( %a) #0 {
 define @smin_i32_ptrue_all_d( %a) #0 {
 ; CHECK-LABEL: smin_i32_ptrue_all_d:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.s, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -764,8 +764,8 @@ define @umax_i16( %a) {
 define @umax_i16_out_of_range( %a) {
 ; CHECK-LABEL: umax_i16_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: dupm z1.b, #0x1
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
 %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
@@ -794,8 +794,8 @@ define @umax_i32( %a) {
 define @umax_i32_out_of_range( %a) {
 ; CHECK-LABEL: umax_i32_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov w8, #257 // =0x101
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
@@ -825,8 +825,8 @@ define @umax_i64( %a) {
 define @umax_i64_out_of_range( %a) {
 ; CHECK-LABEL: umax_i64_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, #65535 // =0xffff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -874,8 +874,8 @@ define @umax_i32_ptrue_all_h( %a) #0 {
 define @umax_i32_ptrue_all_d( %a) #0 {
 ; CHECK-LABEL: umax_i32_ptrue_all_d:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.s, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -921,8 +921,8 @@ define @umin_i16( %a) {
 define @umin_i16_out_of_range( %a) {
 ; CHECK-LABEL: umin_i16_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: dupm z1.b, #0x1
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
 %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
@@ -951,8 +951,8 @@ define @umin_i32( %a) {
 define @umin_i32_out_of_range( %a) {
 ; CHECK-LABEL: umin_i32_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov w8, #257 // =0x101
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z1.s, w8
 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
@@ -982,8 +982,8 @@ define @umin_i64( %a) {
 define @umin_i64_out_of_range( %a) {
 ; CHECK-LABEL: umin_i64_out_of_range:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, #65535 // =0xffff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: ret
 %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -1031,8 +1031,8 @@ define @umin_i32_ptrue_all_h( %a) #0 {
 define @umin_i32_ptrue_all_d( %a) #0 {
 ; CHECK-LABEL: umin_i32_ptrue_all_d:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.s, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
@@ -2120,8 +2120,8 @@ define @mul_i32_ptrue_all_h( %a) #0 {
 define @mul_i32_ptrue_all_d( %a) #0 {
 ; CHECK-LABEL: mul_i32_ptrue_all_d:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.s, #1 // =0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll
index 7cdedeee2cadab..5db7ee75c2a8df 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll
@@ -261,8 +261,8 @@ define @orr_i32_ptrue_all_h( %a) {
 define @orr_i32_ptrue_all_d( %a) {
 ; CHECK-LABEL: orr_i32_ptrue_all_d:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.s, #65535 // =0xffff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: orr z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: ret
 %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
diff --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
index da5dc5c5b34d90..619134dc4a696b 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll
@@ -38,18 +38,18 @@ define @test_post_ld1_dup(ptr %a, ptr %ptr, i64 %inc) {
 define void @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_ptr) #1 {
 ; CHECK-LABEL: test_post_ld1_int_fixed:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov w8, #2 // =0x2
 ; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: ptrue p1.d, vl1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, x8
 ; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: ldr x9, [x0, x1, lsl #3]
+; CHECK-NEXT: ptrue p2.d, vl1
 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
-; CHECK-NEXT: cmpeq p2.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: ldr x9, [x0, x1, lsl #3]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
 ; CHECK-NEXT: mov z0.d, z2.d
-; CHECK-NEXT: mov z2.d, p2/m, x9
-; CHECK-NEXT: mov z0.d, p1/m, x8
+; CHECK-NEXT: mov z0.d, p2/m, x8
+; CHECK-NEXT: mov z2.d, p1/m, x9
 ; CHECK-NEXT: add z0.d, z0.d, z2.d
 ; CHECK-NEXT: st1d { z0.d }, p0, [x3]
 ; CHECK-NEXT: ret
@@ -67,18 +67,18 @@ define void @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_pt
 define void @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_ptr) #1 {
 ; CHECK-LABEL: test_post_ld1_double_fixed:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov w8, #2 // =0x2
 ; CHECK-NEXT: index z0.d, #0, #1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z1.d, x8
-; CHECK-NEXT: ptrue p1.d, vl1
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
-; CHECK-NEXT: cmpeq p2.d, p0/z, z0.d, z1.d
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x0, x1, lsl #3]
-; CHECK-NEXT: sel z0.d, p1, z0.d, z2.d
-; CHECK-NEXT: mov z2.d, p2/m, d1
-; CHECK-NEXT: fadd z0.d, z0.d, z2.d
+; CHECK-NEXT: ptrue p2.d, vl1
+; CHECK-NEXT: ldr d2, [x0, x1, lsl #3]
+; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x2]
+; CHECK-NEXT: ldr d1, [x0]
+; CHECK-NEXT: sel z1.d, p2, z1.d, z0.d
+; CHECK-NEXT: mov z0.d, p1/m, d2
+; CHECK-NEXT: fadd z0.d, z1.d, z0.d
 ; CHECK-NEXT: st1d { z0.d }, p0, [x3]
 ; CHECK-NEXT: ret
 %A = load <4 x double>, ptr %addr
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
index e42e2272a2d4ff..fbe82e8591fd0d 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
@@ -20,8 +20,8 @@ define @ld1r_stack() {
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: sub sp, sp, #16
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: adrp x8, :got:g8
+; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: ldr x8, [x8, :got_lo12:g8]
 ; CHECK-NEXT: ldrb w8, [x8]
 ; CHECK-NEXT: strb w8, [sp, #12]
@@ -1433,10 +1433,10 @@ define ptr @avoid_preindex_load(ptr %src, ptr %out) {
 define ptr @avoid_preindex_load_dup(ptr %src, %pg, ptr %out) {
 ; CHECK-LABEL: avoid_preindex_load_dup:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
 ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: add x0, x0, #1
-; CHECK-NEXT: st1d { z0.d }, p1, [x1]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT: ret
 %ptr = getelementptr inbounds i8, ptr %src, i64 1
 %tmp = load i8, ptr %ptr, align 4
@@ -1450,10 +1450,10 @@ define ptr @avoid_preindex_load_dup(ptr %src, %pg, ptr %out) {
 define ptr @avoid_preindex_load_dup_passthru_zero(ptr %src, %pg, ptr %out) {
 ; CHECK-LABEL: avoid_preindex_load_dup_passthru_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
 ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: add x0, x0, #1
-; CHECK-NEXT: st1d { z0.d }, p1, [x1]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT: ret
 %ptr = getelementptr inbounds i8, ptr %src, i64 1
 %tmp = load i8, ptr %ptr, align 4
@@ -1467,10 +1467,10 @@ define ptr @avoid_preindex_load_dup_passthru_zero(ptr %src, %p
 define ptr @preindex_load_dup_passthru( %passthru, ptr %src, %pg, ptr %out) {
 ; CHECK-LABEL: preindex_load_dup_passthru:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
 ; CHECK-NEXT: ldrsb x8, [x0, #1]!
 ; CHECK-NEXT: mov z0.d, p0/m, x8
-; CHECK-NEXT: st1d { z0.d }, p1, [x1]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT: ret
 %ptr = getelementptr inbounds i8, ptr %src, i64 1
 %tmp = load i8, ptr %ptr, align 4
@@ -1485,8 +1485,8 @@ define ptr @preindex_load_dup_passthru( %passthru, ptr %src, <
 define ptr @preidx8sext64_instead_of_ld1r(ptr %src, ptr %out, ptr %dst) {
 ; CHECK-LABEL: preidx8sext64_instead_of_ld1r:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ldrsb x8, [x0, #1]!
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov z0.d, x8
 ; CHECK-NEXT: st1d { z0.d }, p0, [x1]
 ; CHECK-NEXT: str x8, [x2]
diff --git a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll
index 06ec132808154a..5d53c00c527282 100644
--- a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll
@@ -38,8 +38,8 @@ define void @ld_st_nxv8i16(ptr %in, ptr %out) {
 ;
 ; ASM-LABEL: ld_st_nxv8i16:
 ; ASM: // %bb.0: // %entry
-; ASM-NEXT: ptrue p0.h
 ; ASM-NEXT: mov z0.h, #3 // =0x3
+; ASM-NEXT: ptrue p0.h
 ; ASM-NEXT: mov x8, xzr
 ; ASM-NEXT: cnth x9
 ; ASM-NEXT: .LBB0_1: // %loop
@@ -111,8 +111,8 @@ define void @masked_ld_st_nxv8i16(ptr %in, ptr %out, i64 %n) {
 ;
 ; ASM-LABEL: masked_ld_st_nxv8i16:
 ; ASM: // %bb.0: // %entry
-; ASM-NEXT: ptrue p0.h
 ; ASM-NEXT: mov z0.h, #3 // =0x3
+; ASM-NEXT: ptrue p0.h
 ; ASM-NEXT: mov x8, xzr
 ; ASM-NEXT: cnth x9
 ; ASM-NEXT: .LBB1_1: // %loop
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
index e40d65efb158be..dfdfc456ccdba5 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -126,9 +126,9 @@ define @masked_gather_nxv8f16( %ptrs,
 @masked_gather_nxv8bf16(ptr %base, %indices, %mask) #0 {
 ; CHECK-LABEL: masked_gather_nxv8bf16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: punpkhi p1.h, p0.b
 ; CHECK-NEXT: sunpkhi z1.s, z0.h
 ; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: punpkhi p1.h, p0.b
 ; CHECK-NEXT: punpklo p0.h, p0.b
 ; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, z1.s, sxtw #1]
 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
@@ -175,14 +175,14 @@ define @masked_gather_nxv8f32(ptr %base,
 define @masked_gather_nxv16i8(ptr %base, %indices, %mask) #0 {
 ; CHECK-LABEL: masked_gather_nxv16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: punpkhi p1.h, p0.b
 ; CHECK-NEXT: sunpkhi z1.h, z0.b
+; CHECK-NEXT: punpkhi p1.h, p0.b
 ; CHECK-NEXT: sunpklo z0.h, z0.b
 ; CHECK-NEXT: punpklo p0.h, p0.b
-; CHECK-NEXT: sunpkhi z2.s, z1.h
-; CHECK-NEXT: sunpklo z1.s, z1.h
 ; CHECK-NEXT: punpkhi p2.h, p1.b
 ; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: sunpkhi z2.s, z1.h
+; CHECK-NEXT: sunpklo z1.s, z1.h
 ; CHECK-NEXT: ld1b { z2.s }, p2/z, [x0, z2.s, sxtw]
 ; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, z1.s, sxtw]
 ; CHECK-NEXT: punpkhi p1.h, p0.b
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll
index 40d889f1b501e1..d397424cb162fd 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll
@@ -235,9 +235,9 @@ define @masked_sload_x2_8i8_8i64(ptr %a, ptr %b,
@masked_zload_nxv8i16(ptr %a, %mask)
 define @masked_zload_2i16_2f64(ptr noalias %in, %mask) {
 ; CHECK-LABEL: masked_zload_2i16_2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
 %wide.load = call @llvm.masked.load.nxv2i16(ptr %in, i32 2, %mask, undef)
 %zext = zext %wide.load to
@@ -230,9 +230,9 @@ define @masked_zload_x2_8i8_8i64(ptr %a, ptr %b,
 %data, ptr %base, %offsets, %mask) #0 {
 ; CHECK-LABEL: masked_scatter_nxv16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: sunpklo z2.h, z1.b
 ; CHECK-NEXT: uunpklo z4.h, z0.b
-; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: sunpkhi z1.h, z1.b
 ; CHECK-NEXT: uunpkhi z0.h, z0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: punpklo p2.h, p1.b
+; CHECK-NEXT: punpkhi p1.h, p1.b
 ; CHECK-NEXT: sunpklo z3.s, z2.h
 ; CHECK-NEXT: uunpklo z5.s, z4.h
 ; CHECK-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEXT: punpklo p2.h, p1.b
-; CHECK-NEXT: punpkhi p1.h, p1.b
 ; CHECK-NEXT: st1b { z5.s }, p2, [x0, z3.s, sxtw]
 ; CHECK-NEXT: uunpkhi z3.s, z4.h
 ; CHECK-NEXT: st1b { z3.s }, p1, [x0, z2.s, sxtw]
-; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: sunpklo z2.s, z1.h
-; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: uunpklo z3.s, z0.h
 ; CHECK-NEXT: sunpkhi z1.s, z1.h
 ; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: st1b { z3.s }, p1, [x0, z2.s, sxtw]
 ; CHECK-NEXT: st1b { z0.s }, p0, [x0, z1.s, sxtw]
 ; CHECK-NEXT: ret
@@ -40,12 +40,12 @@ define void @masked_scatter_nxv16i8( %data, ptr %base,
 define void @masked_scatter_nxv8i16( %data, ptr %base, %offsets, %mask) #0 {
 ; CHECK-LABEL: masked_scatter_nxv8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: sunpklo z2.s, z1.h
 ; CHECK-NEXT: uunpklo z3.s, z0.h
-; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: sunpkhi z1.s, z1.h
 ; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: st1h { z3.s }, p1, [x0, z2.s, sxtw #1]
 ; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
 ; CHECK-NEXT: ret
@@ -57,12 +57,12 @@ define void @masked_scatter_nxv8i16( %data, ptr %base,
 define void @masked_scatter_nxv8bf16( %data, ptr %base, %offsets, %mask) #0 {
 ; CHECK-LABEL: masked_scatter_nxv8bf16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: sunpklo z2.s, z1.h
 ; CHECK-NEXT: uunpklo z3.s, z0.h
-; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: sunpkhi z1.s, z1.h
 ; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: st1h { z3.s }, p1, [x0, z2.s, sxtw #1]
 ; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
index e866474942cd74..94e525d22b8258 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
@@ -76,8 +76,8 @@ define void @masked_scatter_nxv2f64( %data,
 %pg) {
 ; CHECK-LABEL: masked_scatter_splat_constant_pointer:
 ; CHECK: // %bb.0: // %vector.body
-; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: mov z0.d, #0 // =0x0
+; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: st1w { z0.d }, p1, [z0.d]
 ; CHECK-NEXT: st1w { z0.d }, p0, [z0.d]
diff --git a/llvm/test/CodeGen/AArch64/sve-pr62151.ll b/llvm/test/CodeGen/AArch64/sve-pr62151.ll
index 7cec20fda429c1..5ed34f14a0b140 100644
--- a/llvm/test/CodeGen/AArch64/sve-pr62151.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pr62151.ll
@@ -5,8 +5,8 @@ define i32 @build_interpolation(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) {
 ; CHECK-LABEL: build_interpolation:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s, vl2
 ; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s
+; CHECK-NEXT: ptrue p0.s, vl2
 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2
 ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s
 ; CHECK-NEXT: mla v0.2s, v1.2s, v0.s[1]
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-arith.ll b/llvm/test/CodeGen/AArch64/sve-pred-arith.ll
index 4d46ac5ecbaa95..6e08606db95376 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-arith.ll
@@ -54,24 +54,24 @@ define aarch64_sve_vector_pcs @add_nxv64i1(
 ; CHECK-NEXT: .cfi_offset w29, -16
 ; CHECK-NEXT: addvl sp, sp, #-1
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: ptrue p4.b
 ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue p6.b
 ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: ldr p5, [x0]
-; CHECK-NEXT: ldr p6, [x1]
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ldr p4, [x0]
+; CHECK-NEXT: ldr p5, [x1]
 ; CHECK-NEXT: ldr p7, [x2]
 ; CHECK-NEXT: ldr p8, [x3]
-; CHECK-NEXT: eor p0.b, p4/z, p0.b, p5.b
-; CHECK-NEXT: eor p1.b, p4/z, p1.b, p6.b
-; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: eor p2.b, p4/z, p2.b, p7.b
+; CHECK-NEXT: eor p0.b, p6/z, p0.b, p4.b
+; CHECK-NEXT: eor p1.b, p6/z, p1.b, p5.b
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: eor p2.b, p6/z, p2.b, p7.b
 ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: eor p3.b, p4/z, p3.b, p8.b
+; CHECK-NEXT: eor p3.b, p6/z, p3.b, p8.b
 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
@@ -138,24 +138,24 @@ define aarch64_sve_vector_pcs @sub_nxv64i1(
 ; CHECK-NEXT: .cfi_offset w29, -16
 ; CHECK-NEXT: addvl sp, sp, #-1
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: ptrue p4.b
 ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue p6.b
 ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: ldr p5, [x0]
-; CHECK-NEXT: ldr p6, [x1]
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ldr p4, [x0]
+; CHECK-NEXT: ldr p5, [x1]
 ; CHECK-NEXT: ldr p7, [x2]
 ; CHECK-NEXT: ldr p8, [x3]
-; CHECK-NEXT: eor p0.b, p4/z, p0.b, p5.b
-; CHECK-NEXT: eor p1.b, p4/z, p1.b, p6.b
-; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: eor p2.b, p4/z, p2.b, p7.b
+; CHECK-NEXT: eor p0.b, p6/z, p0.b, p4.b
+; CHECK-NEXT: eor p1.b, p6/z, p1.b, p5.b
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: eor p2.b, p6/z, p2.b, p7.b
 ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: eor p3.b, p4/z, p3.b, p8.b
+; CHECK-NEXT: eor p3.b, p6/z, p3.b, p8.b
 ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: .cfi_def_cfa wsp, 16
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
index 600e9c4805ff73..8438e9d88f5de0 100644
--- a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll
@@ -322,10 +322,10 @@ entry:
 define @ornot_v4i32( %z, %x, %y) {
 ; CHECK-LABEL: ornot_v4i32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z3.s, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z2.d, z2.d, z3.d
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: eor z2.d, z2.d, z3.d
 ; CHECK-NEXT: orr z1.d, z1.d, z2.d
 ; CHECK-NEXT: mov z0.s, p0/m, z1.s
 ; CHECK-NEXT: ret
@@ -340,10 +340,10 @@ entry:
 define @ornot_v8i16( %z, %x, %y) {
 ; CHECK-LABEL: ornot_v8i16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z3.h, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z2.d, z2.d, z3.d
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: eor z2.d, z2.d, z3.d
 ; CHECK-NEXT: orr z1.d, z1.d, z2.d
 ; CHECK-NEXT: mov z0.h, p0/m, z1.h
 ; CHECK-NEXT: ret
@@ -358,10 +358,10 @@ entry:
 define @ornot_v16i8( %z, %x, %y) {
 ; CHECK-LABEL: ornot_v16i8:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov z3.b, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z2.d, z2.d, z3.d
+; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: eor z2.d, z2.d, z3.d
 ; CHECK-NEXT: orr z1.d, z1.d, z2.d
 ; CHECK-NEXT: mov z0.b, p0/m, z1.b
 ; CHECK-NEXT: ret
@@ -904,8 +904,8 @@ define @addqr_v4i32( %z,
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z2.s, w0
-; CHECK-NEXT: add z1.s, z1.s, z2.s
 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: add z1.s, z1.s, z2.s
 ; CHECK-NEXT: mov z0.s, p0/m, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -922,8 +922,8 @@ define @addqr_v8i16( %z,
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z2.h, w0
-; CHECK-NEXT: add z1.h, z1.h, z2.h
 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: add z1.h, z1.h, z2.h
 ; CHECK-NEXT: mov z0.h, p0/m, z1.h
 ; CHECK-NEXT: ret
 entry:
@@ -940,8 +940,8 @@ define @addqr_v16i8( %z,
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov z2.b, w0
-; CHECK-NEXT: add z1.b, z1.b, z2.b
 ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: add z1.b, z1.b, z2.b
 ; CHECK-NEXT: mov z0.b, p0/m, z1.b
 ; CHECK-NEXT: ret
 entry:
@@ -958,8 +958,8 @@ define @subqr_v4i32( %z,
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z2.s, w0
-; CHECK-NEXT: sub z1.s, z1.s, z2.s
 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: sub z1.s, z1.s, z2.s
 ; CHECK-NEXT: mov z0.s, p0/m, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -976,8 +976,8 @@ define @subqr_v8i16( %z,
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z2.h, w0
-; CHECK-NEXT: sub z1.h, z1.h, z2.h
 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: sub z1.h, z1.h, z2.h
 ; CHECK-NEXT: mov z0.h, p0/m, z1.h
 ; CHECK-NEXT: ret
 entry:
@@ -994,8 +994,8 @@ define @subqr_v16i8( %z,
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov z2.b, w0
-; CHECK-NEXT: sub z1.b, z1.b, z2.b
 ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: sub z1.b, z1.b, z2.b
 ; CHECK-NEXT: mov z0.b, p0/m, z1.b
 ; CHECK-NEXT: ret
 entry:
@@ -1010,10 +1010,10 @@ entry:
 define @mulqr_v4i32( %z, %x, i32 %y) {
 ; CHECK-LABEL: mulqr_v4i32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z2.s, w0
-; CHECK-NEXT: mul z1.s, z1.s, z2.s
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: mul z1.s, z1.s, z2.s
 ; CHECK-NEXT: mov z0.s, p0/m, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -1028,10 +1028,10 @@ entry:
 define @mulqr_v8i16( %z, %x, i16 %y) {
 ; CHECK-LABEL: mulqr_v8i16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z2.h, w0
-; CHECK-NEXT: mul z1.h, z1.h, z2.h
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: mul z1.h, z1.h, z2.h
 ; CHECK-NEXT: mov z0.h, p0/m, z1.h
 ; CHECK-NEXT: ret
 entry:
@@ -1046,10 +1046,10 @@ entry:
 define @mulqr_v16i8( %z, %x, i8 %y) {
 ; CHECK-LABEL: mulqr_v16i8:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov z2.b, w0
-; CHECK-NEXT: mul z1.b, z1.b, z2.b
+; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: mul z1.b, z1.b, z2.b
 ; CHECK-NEXT: mov z0.b, p0/m, z1.b
 ; CHECK-NEXT: ret
 entry:
@@ -1064,11 +1064,11 @@ entry:
 define @faddqr_v4f32( %z, %x, float %y) {
 ; CHECK-LABEL: faddqr_v4f32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z2.s, s2
-; CHECK-NEXT: fadd z1.s, z1.s, z2.s
 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: fadd z1.s, z1.s, z2.s
 ; CHECK-NEXT: mov z0.s, p0/m, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -1083,11 +1083,11 @@ entry:
 define @faddqr_v8f16( %z, %x, half %y) {
 ; CHECK-LABEL: faddqr_v8f16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z2.h, h2
-; CHECK-NEXT: fadd z1.h, z1.h, z2.h
 ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: fadd z1.h, z1.h, z2.h
 ; CHECK-NEXT: mov z0.h, p0/m, z1.h
 ; CHECK-NEXT: ret
 entry:
@@ -1102,11 +1102,11 @@ entry:
 define @fsubqr_v4f32( %z, %x, float %y) {
 ; CHECK-LABEL: fsubqr_v4f32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z2.s, s2
-; CHECK-NEXT: fsub z1.s, z1.s, z2.s
 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: fsub z1.s, z1.s, z2.s
 ; CHECK-NEXT: mov z0.s, p0/m, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -1121,11 +1121,11 @@ entry:
 define @fsubqr_v8f16( %z, %x, half %y) {
 ; CHECK-LABEL: fsubqr_v8f16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z2.h, h2
-; CHECK-NEXT: fsub z1.h, z1.h, z2.h
 ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: fsub z1.h, z1.h, z2.h
 ; CHECK-NEXT: mov z0.h, p0/m, z1.h
 ; CHECK-NEXT: ret
 entry:
@@ -1140,11 +1140,11 @@ entry:
 define @fmulqr_v4f32( %z, %x, float %y) {
 ; CHECK-LABEL: fmulqr_v4f32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z2.s, s2
-; CHECK-NEXT: fmul z1.s, z1.s, z2.s
 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: fmul z1.s, z1.s, z2.s
 ; CHECK-NEXT: mov z0.s, p0/m, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -1159,11 +1159,11 @@ entry:
 define @fmulqr_v8f16( %z, %x, half %y) {
 ; CHECK-LABEL: fmulqr_v8f16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z2.h, h2
-; CHECK-NEXT: fmul z1.h, z1.h, z2.h
 ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT: fmul z1.h, z1.h, z2.h
 ; CHECK-NEXT: mov z0.h, p0/m, z1.h
 ; CHECK-NEXT: ret
 entry:
@@ -1178,10 +1178,10 @@ entry:
 define @sadd_satqr_v4i32( %z, %x, i32 %y) {
 ; CHECK-LABEL: sadd_satqr_v4i32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z2.s, w0
-; CHECK-NEXT: sqadd z1.s, z1.s, z2.s
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: sqadd z1.s, z1.s, z2.s
 ; CHECK-NEXT: mov z0.s, p0/m, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -1196,10 +1196,10 @@ entry:
 define @sadd_satqr_v8i16( %z, %x, i16 %y) {
 ; CHECK-LABEL: sadd_satqr_v8i16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z2.h, w0
-; CHECK-NEXT: sqadd z1.h, z1.h, z2.h
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: sqadd z1.h, z1.h, z2.h
 ; CHECK-NEXT: mov z0.h, p0/m, z1.h
 ; CHECK-NEXT: ret
 entry:
@@ -1214,10 +1214,10 @@ entry:
 define @sadd_satqr_v16i8( %z, %x, i8 %y) {
 ; CHECK-LABEL: sadd_satqr_v16i8:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov z2.b, w0
-; CHECK-NEXT: sqadd z1.b, z1.b, z2.b
+; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: sqadd z1.b, z1.b, z2.b
 ; CHECK-NEXT: mov z0.b, p0/m, z1.b
 ; CHECK-NEXT: ret
 entry:
@@ -1232,10 +1232,10 @@ entry:
 define @uadd_satqr_v4i32( %z, %x, i32 %y) {
 ; CHECK-LABEL: uadd_satqr_v4i32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z2.s, w0
-; CHECK-NEXT: uqadd z1.s, z1.s, z2.s
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: uqadd z1.s, z1.s, z2.s
 ; CHECK-NEXT: mov z0.s, p0/m, z1.s
 ; CHECK-NEXT: ret
 entry:
@@ -1250,10 +1250,10 @@ entry:
 define @uadd_satqr_v8i16( %z, %x, i16 %y) {
 ; CHECK-LABEL: uadd_satqr_v8i16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mov z2.h, w0
-; CHECK-NEXT: uqadd z1.h, z1.h, z2.h
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: uqadd z1.h, z1.h, z2.h
 ; CHECK-NEXT: mov z0.h, p0/m, z1.h
 ; CHECK-NEXT: ret
 entry:
@@ -1268,10 +1268,10 @@ entry:
 define @uadd_satqr_v16i8( %z, %x, i8 %y) {
 ; CHECK-LABEL: uadd_satqr_v16i8:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: mov z2.b, w0
-; CHECK-NEXT: uqadd z1.b, z1.b, z2.b
CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: uqadd z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1286,10 +1286,10 @@ entry: define @ssub_satqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: ssub_satqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: sqsub z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: sqsub z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1304,10 +1304,10 @@ entry: define @ssub_satqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: ssub_satqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: sqsub z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: sqsub z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1322,10 +1322,10 @@ entry: define @ssub_satqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: ssub_satqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: sqsub z1.b, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: sqsub z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1340,10 +1340,10 @@ entry: define @usub_satqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: usub_satqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: uqsub z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: uqsub z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1358,10 +1358,10 @@ entry: define @usub_satqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: usub_satqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: uqsub z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: uqsub z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1376,10 +1376,10 @@ entry: define @usub_satqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: usub_satqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: uqsub z1.b, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: uqsub z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll index 14bc1b45e79ee3..2541910e080e3c 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll @@ -202,9 +202,9 @@ entry: define @sdiv_nxv8i16_x( %x, %y, %n) { ; CHECK-LABEL: sdiv_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z0.h @@ -288,9 +288,9 @@ entry: define @udiv_nxv8i16_x( %x, %y, %n) { ; CHECK-LABEL: udiv_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z0.h @@ -376,17 +376,17 @@ entry: define @srem_nxv8i16_x( %x, %y, %n) { ; 
CHECK-LABEL: srem_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z5.s, z0.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z1.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: ret entry: %c = icmp sgt %n, zeroinitializer @@ -419,8 +419,8 @@ define @srem_nxv16i8_x( %x, %n, zeroinitializer @@ -464,17 +464,17 @@ entry: define @urem_nxv8i16_x( %x, %y, %n) { ; CHECK-LABEL: urem_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z5.s, z0.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: ret entry: %c = icmp sgt %n, zeroinitializer @@ -507,8 +507,8 @@ define @urem_nxv16i8_x( %x, %n, zeroinitializer @@ -1140,8 +1140,8 @@ define @fdiv_nxv8f16_x( %x, @sdiv_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: sdiv_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z1.h @@ -1740,9 +1740,9 @@ entry: define @udiv_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: udiv_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z1.h @@ -1830,17 +1830,17 @@ entry: define @srem_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: srem_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z5.s, z0.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z1.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: msb z1.h, p0/m, z3.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: msb z1.h, p0/m, z2.h, z0.h ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: @@ -1874,8 +1874,8 @@ define @srem_nxv16i8_y( %x, @urem_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: urem_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z5.s, z0.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: msb z1.h, p0/m, z3.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: msb z1.h, p0/m, z2.h, z0.h ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: @@ -1966,8 +1966,8 @@ define 
@urem_nxv16i8_y( %x, @fdiv_nxv8f16_y( %x, @fmai_nxv4f32_y( %x, %n, zeroinitializer @@ -2871,8 +2871,8 @@ define @fmai_nxv8f16_y( %x, %n, zeroinitializer @@ -2887,8 +2887,8 @@ define @fmai_nxv2f64_y( %x, %n, zeroinitializer @@ -2903,8 +2903,8 @@ define @fma_nxv4f32_y( %x, %n, zeroinitializer @@ -2920,8 +2920,8 @@ define @fma_nxv8f16_y( %x, %n, zeroinitializer @@ -2937,8 +2937,8 @@ define @fma_nxv2f64_y( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll index 0f09f7dac2982d..bafd5abcc7b23b 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll @@ -792,8 +792,8 @@ define @fdiv_nxv8f16_x( %x, @fdiv_nxv8f16_y( %x, @fmai_nxv4f32_y( %x, %n, zeroinitializer @@ -1750,8 +1750,8 @@ define @fmai_nxv8f16_y( %x, %n, zeroinitializer @@ -1766,8 +1766,8 @@ define @fmai_nxv2f64_y( %x, %n, zeroinitializer @@ -1782,8 +1782,8 @@ define @fma_nxv4f32_y( %x, %n, zeroinitializer @@ -1799,8 +1799,8 @@ define @fma_nxv8f16_y( %x, %n, zeroinitializer @@ -1816,8 +1816,8 @@ define @fma_nxv2f64_y( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll index fbadbf7226fd16..8bd38d7bc44dfd 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll @@ -320,8 +320,8 @@ define i1 @cmp32_ptest_any_xx( %pg, %a, %pg, %a, %b) { ; CHECK-LABEL: cmp8_ptest_first_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cmpge p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, mi ; CHECK-NEXT: ret @@ -338,8 +338,8 @@ define i1 @cmp8_ptest_first_ax( %pg, %a, %pg, %a, %b) { ; CHECK-LABEL: cmp8_ptest_last_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cmpge p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -371,8 +371,8 @@ define i1 @cmp8_ptest_any_ax( %pg, %a, %pg, %a, %b) { ; CHECK-LABEL: cmp32_ptest_first_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: cmpge p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, mi ; CHECK-NEXT: ret @@ -390,8 +390,8 @@ define i1 @cmp32_ptest_first_ax( %pg, %a, < define i1 @cmp32_ptest_last_ax( %pg, %a, %b) { ; CHECK-LABEL: cmp32_ptest_last_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: cmpge p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-redundant-store.ll b/llvm/test/CodeGen/AArch64/sve-redundant-store.ll index 6873404724f1d9..508fe5d5a58a57 100644 --- a/llvm/test/CodeGen/AArch64/sve-redundant-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-redundant-store.ll @@ -35,8 +35,8 @@ entry: define void @keep_scalable_store(ptr writeonly %ptr, ptr %a, %b) { ; CHECK-LABEL: keep_scalable_store: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll index a1c2ec9c7e1d42..76190eba870ded 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll @@ -22,15 +22,15 @@ 
define i8 @split_extract_32i8_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rdvl x8, #2 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrb w0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -46,15 +46,15 @@ define i16 @split_extract_16i16_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrh w0, [x9, x8, lsl #1] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -70,15 +70,15 @@ define i32 @split_extract_8i32_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldr w0, [x9, x8, lsl #2] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -94,15 +94,15 @@ define i64 @split_extract_8i64_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] ; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: ldr x0, [x9, x8, lsl #3] @@ -140,15 +140,15 @@ define i16 @split_extract_16i16( %a) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, #128 // =0x80 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1h { z1.h }, p0, 
[sp, #1, mul vl] ; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrh w0, [x9, x8, lsl #1] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -164,16 +164,16 @@ define i32 @split_extract_16i32( %a) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: rdvl x8, #1 ; CHECK-NEXT: mov w9, #34464 // =0x86a0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w9, #1, lsl #16 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl] ; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: ldr w0, [x9, x8, lsl #2] @@ -191,15 +191,15 @@ define i64 @split_extract_4i64( %a) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntw x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, #10 // =0xa ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, #10 -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldr x0, [x9, x8, lsl #3] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll index 3997409172d03b..bc015116917d88 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll @@ -6,9 +6,9 @@ define @fcvts_nxv8f16( %a) { ; CHECK-LABEL: fcvts_nxv8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: uunpkhi z2.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvt z0.s, p0/m, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -21,9 +21,9 @@ define @fcvts_nxv8f16( %a) { define @fcvtd_nxv4f16( %a) { ; CHECK-LABEL: fcvtd_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvt z0.d, p0/m, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -37,8 +37,8 @@ define @fcvtd_nxv8f16( %a) { ; CHECK-LABEL: fcvtd_nxv8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z1.s ; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: uunpklo z3.d, z0.s @@ -58,9 +58,9 @@ define @fcvtd_nxv8f16( %a) { define @fcvtd_nxv4f32( %a) { ; CHECK-LABEL: fcvtd_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvt z0.d, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -73,11 +73,11 @@ define @fcvtd_nxv4f32( %a) { define @fcvtd_nxv8f32( %a) { ; CHECK-LABEL: fcvtd_nxv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: 
uunpkhi z3.d, z0.s ; CHECK-NEXT: uunpklo z4.d, z1.s ; CHECK-NEXT: uunpkhi z5.d, z1.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z2 ; CHECK-NEXT: fcvt z0.d, p0/m, z2.s ; CHECK-NEXT: movprfx z1, z3 @@ -195,9 +195,9 @@ define @fcvtzs_h_nxv8f64( %a) { define @fcvtzs_d_nxv4f32( %a) { ; CHECK-LABEL: fcvtzs_d_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -210,11 +210,11 @@ define @fcvtzs_d_nxv4f32( %a) { define @fcvtzs_s_nxv16f16( %a) { ; CHECK-LABEL: fcvtzs_s_nxv16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: uunpkhi z5.s, z1.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movprfx z0, z2 ; CHECK-NEXT: fcvtzs z0.s, p0/m, z2.h ; CHECK-NEXT: movprfx z1, z3 @@ -247,9 +247,9 @@ define @fcvtzu_s_nxv4f64( %a) { define @fcvtzu_d_nxv4f32( %a) { ; CHECK-LABEL: fcvtzu_d_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -295,8 +295,8 @@ define @scvtf_s_nxv16i8( %a) { ; CHECK-LABEL: scvtf_s_nxv16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: sunpklo z1.h, z0.b -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z0.h, z0.b +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h @@ -316,9 +316,9 @@ define @scvtf_s_nxv16i8( %a) { define @scvtf_d_nxv4i32( %a) { ; CHECK-LABEL: scvtf_d_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sunpklo z1.d, z0.s ; CHECK-NEXT: sunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: scvtf z0.d, p0/m, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -333,8 +333,8 @@ define @scvtf_d_nxv4i1( %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: scvtf z0.d, p1/m, z0.d ; CHECK-NEXT: scvtf z1.d, p1/m, z1.d @@ -378,9 +378,9 @@ define @ucvtf_h_nxv8i64( %a) { define @ucvtf_d_nxv4i32( %a) { ; CHECK-LABEL: ucvtf_d_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ucvtf z0.d, p0/m, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -395,8 +395,8 @@ define @ucvtf_d_nxv4i1( %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p2/z, #1 // =0x1 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1 ; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d ; CHECK-NEXT: ucvtf z1.d, p1/m, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll index 7f642882eddbee..696b6c34ef0415 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll @@ -23,8 +23,8 @@ define double @fadda_nxv8f64(double %init, %a) { define float @faddv_nxv8f32(float %init, %a) { ; CHECK-LABEL: faddv_nxv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fadd z1.s, z1.s, z2.s +; 
CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll index 5441659fa5cb45..75366384cb750f 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll @@ -6,9 +6,9 @@ define @promote_insert_8i8( %a, i8 %elt, i64 %idx) { ; CHECK-LABEL: promote_insert_8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: index z1.h, #0, #1 ; CHECK-NEXT: mov z2.h, w1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, w0 ; CHECK-NEXT: ret @@ -23,13 +23,13 @@ define @split_insert_32i8_idx( %a, i8 %elt, ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rdvl x8, #2 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x1, x8 -; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: strb w0, [x9, x8] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] @@ -48,13 +48,13 @@ define @split_insert_8f32_idx( %a, floa ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x0, x8 -; CHECK-NEXT: csel x8, x0, x8, lo ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x0, x8, lo ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: str s2, [x9, x8, lsl #2] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] @@ -73,13 +73,13 @@ define @split_insert_8i64_idx( %a, i64 %elt ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x1, x8 -; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] +; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] @@ -100,9 +100,9 @@ define @split_insert_8i64_idx( %a, i64 %elt define @promote_insert_4i16( %a, i16 %elt) { ; CHECK-LABEL: promote_insert_4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #5 // =0x5 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, w0 @@ -117,9 +117,9 @@ define @promote_insert_4i16( %a, i16 %elt) define @split_insert_32i8( %a, i8 %elt) { ; CHECK-LABEL: split_insert_32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z2.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z3.b, w8 ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b ; CHECK-NEXT: mov z0.b, p0/m, w0 @@ -135,14 +135,14 @@ define @split_insert_32i16( %a, i16 %elt) ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: 
.cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rdvl x8, #2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, #128 // =0x80 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl] ; CHECK-NEXT: st1h { z2.h }, p0, [sp, #2, mul vl] ; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1h { z0.h }, p0, [sp] @@ -165,15 +165,15 @@ define @split_insert_8i32( %a, i32 %elt) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #16960 // =0x4240 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w9, #15, lsl #16 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: str w0, [x9, x8, lsl #2] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll index 42f3a163d14cc0..dd7b15ef5ee6f4 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll @@ -17,8 +17,8 @@ define i8 @andv_nxv8i8( %a) { define i32 @andv_nxv8i32( %a) { ; CHECK-LABEL: andv_nxv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -71,8 +71,8 @@ define i16 @xorv_nxv2i16( %a) { define i32 @xorv_nxv8i32( %a) { ; CHECK-LABEL: xorv_nxv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -97,8 +97,8 @@ define i16 @uaddv_nxv4i16( %a) { define i16 @uaddv_nxv16i16( %a) { ; CHECK-LABEL: uaddv_nxv16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: uaddv d0, p0, z0.h ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -127,8 +127,8 @@ define i32 @uaddv_nxv16i32( %a) { define i32 @umin_nxv2i32( %a) { ; CHECK-LABEL: umin_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 diff --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll index af03059cf0d8bb..754f0339702dc7 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll @@ -93,8 +93,8 @@ define @masked_load_split_32i16(ptr %a, % ; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpklo p3.h, p1.b -; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: ld1h { z0.h }, p2/z, [x0] +; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl] ; CHECK-NEXT: ld1h { z2.h }, p3/z, [x0, #2, mul vl] ; CHECK-NEXT: ld1h { z3.h }, p1/z, [x0, #3, 
mul vl] @@ -123,8 +123,8 @@ define @masked_load_split_8i64(ptr %a, %pg) ; CHECK-NEXT: punpklo p2.h, p1.b ; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: punpklo p3.h, p0.b -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0] +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x0, #1, mul vl] ; CHECK-NEXT: ld1d { z2.d }, p3/z, [x0, #2, mul vl] ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #3, mul vl] diff --git a/llvm/test/CodeGen/AArch64/sve-split-store.ll b/llvm/test/CodeGen/AArch64/sve-split-store.ll index 90ec783ea4dbcb..affa9a18ac1824 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-store.ll @@ -81,8 +81,8 @@ define void @masked_store_split_32i16( %data, ptr %a, %data, ptr %a, %data, ptr %a, i32 1, %pg) diff --git a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll index 9c3d4b1e5a8100..f556d60d23b88e 100644 --- a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll +++ b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll @@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu" define @srem_combine_loop( %a) #0 { ; CHECK-LABEL: srem_combine_loop: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #2 // =0x2 ; CHECK-NEXT: asrd z1.s, p0/m, z1.s, #1 ; CHECK-NEXT: mls z0.s, p0/m, z1.s, z2.s diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll index 728041d3f916b4..3273e6b384f637 100644 --- a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll @@ -105,8 +105,8 @@ define void @st1d_inbound( %data, ptr %a) { define void @store_nxv2f32(ptr %out) { ; CHECK-LABEL: store_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov z0.s, #1.00000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1w { z0.d }, p0, [x0] ; CHECK-NEXT: ret %ins = insertelement undef, float 1.0, i32 0 @@ -118,8 +118,8 @@ define void @store_nxv2f32(ptr %out) { define void @store_nxv4f16(ptr %out) { ; CHECK-LABEL: store_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov z0.h, #1.00000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret %ins = insertelement undef, half 1.0, i32 0 @@ -133,9 +133,9 @@ define void @store_nxv4f16(ptr %out) { define void @store_nxv6f32(ptr %out) { ; CHECK-LABEL: store_nxv6f32: ; CHECK: // %bb.0: +; CHECK-NEXT: fmov z0.s, #1.00000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: fmov z0.s, #1.00000000 ; CHECK-NEXT: st1w { z0.d }, p0, [x0, #2, mul vl] ; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret @@ -148,9 +148,9 @@ define void @store_nxv6f32(ptr %out) { define void @store_nxv12f16(ptr %out) { ; CHECK-LABEL: store_nxv12f16: ; CHECK: // %bb.0: +; CHECK-NEXT: fmov z0.h, #1.00000000 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: fmov z0.h, #1.00000000 ; CHECK-NEXT: st1h { z0.s }, p0, [x0, #2, mul vl] ; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-stepvector.ll b/llvm/test/CodeGen/AArch64/sve-stepvector.ll index 6f5a31248de7ed..4c5f27d3e7093e 100644 --- a/llvm/test/CodeGen/AArch64/sve-stepvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-stepvector.ll @@ -208,9 +208,9 @@ entry: define @multiple_use_stepvector_nxv4i32_1(i32 %data) { ; CHECK-LABEL: multiple_use_stepvector_nxv4i32_1: ; 
CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: index z0.s, w0, #1 ; CHECK-NEXT: mov z1.s, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mul z1.s, p0/m, z1.s, z0.s ; CHECK-NEXT: sub z0.s, z1.s, z0.s ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll index be5c318e675df2..d547f99a0230a6 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll @@ -11,8 +11,8 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @ctlz_v4i8(<4 x i8> %op) { ; CHECK-LABEL: ctlz_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: sub z0.h, z0.h, #8 // =0x8 @@ -49,8 +49,8 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) { define void @ctlz_v32i8(ptr %a) { ; CHECK-LABEL: ctlz_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: clz z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -64,8 +64,8 @@ define void @ctlz_v32i8(ptr %a) { define <2 x i16> @ctlz_v2i16(<2 x i16> %op) { ; CHECK-LABEL: ctlz_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: sub z0.s, z0.s, #16 // =0x10 @@ -102,8 +102,8 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) { define void @ctlz_v16i16(ptr %a) { ; CHECK-LABEL: ctlz_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: clz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -141,8 +141,8 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) { define void @ctlz_v8i32(ptr %a) { ; CHECK-LABEL: ctlz_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: clz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -180,8 +180,8 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) { define void @ctlz_v4i64(ptr %a) { ; CHECK-LABEL: ctlz_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: clz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -199,8 +199,8 @@ define void @ctlz_v4i64(ptr %a) { define <4 x i8> @ctpop_v4i8(<4 x i8> %op) { ; CHECK-LABEL: ctpop_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -236,8 +236,8 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) { define void @ctpop_v32i8(ptr %a) { ; CHECK-LABEL: ctpop_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cnt z0.b, p0/m, z0.b ; CHECK-NEXT: cnt z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -251,8 +251,8 @@ define void @ctpop_v32i8(ptr %a) { define <2 x i16> @ctpop_v2i16(<2 x i16> %op) { ; CHECK-LABEL: 
ctpop_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -288,8 +288,8 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) { define void @ctpop_v16i16(ptr %a) { ; CHECK-LABEL: ctpop_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: cnt z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -327,8 +327,8 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) { define void @ctpop_v8i32(ptr %a) { ; CHECK-LABEL: ctpop_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: cnt z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -366,8 +366,8 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) { define void @ctpop_v4i64(ptr %a) { ; CHECK-LABEL: ctpop_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cnt z0.d, p0/m, z0.d ; CHECK-NEXT: cnt z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -385,8 +385,8 @@ define void @ctpop_v4i64(ptr %a) { define <4 x i8> @cttz_v4i8(<4 x i8> %op) { ; CHECK-LABEL: cttz_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: orr z0.h, z0.h, #0x100 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: clz z0.h, p0/m, z0.h @@ -425,8 +425,8 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) { define void @cttz_v32i8(ptr %a) { ; CHECK-LABEL: cttz_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: rbit z0.b, p0/m, z0.b ; CHECK-NEXT: rbit z1.b, p0/m, z1.b ; CHECK-NEXT: clz z0.b, p0/m, z0.b @@ -442,8 +442,8 @@ define void @cttz_v32i8(ptr %a) { define <2 x i16> @cttz_v2i16(<2 x i16> %op) { ; CHECK-LABEL: cttz_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: orr z0.s, z0.s, #0x10000 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: clz z0.s, p0/m, z0.s @@ -482,8 +482,8 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) { define void @cttz_v16i16(ptr %a) { ; CHECK-LABEL: cttz_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: rbit z1.h, p0/m, z1.h ; CHECK-NEXT: clz z0.h, p0/m, z0.h @@ -525,8 +525,8 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) { define void @cttz_v8i32(ptr %a) { ; CHECK-LABEL: cttz_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: rbit z1.s, p0/m, z1.s ; CHECK-NEXT: clz z0.s, p0/m, z0.s @@ -568,8 +568,8 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) { define void @cttz_v4i64(ptr %a) { ; CHECK-LABEL: cttz_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: rbit z0.d, p0/m, z0.d ; CHECK-NEXT: rbit z1.d, p0/m, z1.d ; CHECK-NEXT: clz z0.d, p0/m, z0.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 251a7c3b18a9ff..0aefba2d4c6abe 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -169,15 +169,15 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap) { ; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: mov x8, #8 // =0x8 ; CHECK-NEXT: ld1h { z2.d }, p0/z, [x0, x9, lsl #1] -; CHECK-NEXT: mov x9, #10 // =0xa ; CHECK-NEXT: ld1h { z3.d }, p0/z, [x0, x10, lsl #1] -; CHECK-NEXT: mov x10, #12 // =0xc +; CHECK-NEXT: mov x9, #10 // =0xa ; CHECK-NEXT: ld1h { z4.d }, p0/z, [x0, x8, lsl #1] -; CHECK-NEXT: mov x8, #14 // =0xe +; CHECK-NEXT: mov x8, #12 // =0xc +; CHECK-NEXT: mov x10, #14 // =0xe ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1h { z5.d }, p0/z, [x0, x9, lsl #1] -; CHECK-NEXT: ld1h { z6.d }, p0/z, [x0, x10, lsl #1] -; CHECK-NEXT: ld1h { z7.d }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z6.d }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z7.d }, p0/z, [x0, x10, lsl #1] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index 2ace0bca274af1..0d6675def8b52f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -241,8 +241,8 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) { ; SVE-NEXT: ptrue p0.d ; SVE-NEXT: ldr q0, [x1] ; SVE-NEXT: ldr d1, [x0] -; SVE-NEXT: and z1.s, z1.s, #0x7fffffff ; SVE-NEXT: fcvt z0.s, p0/m, z0.d +; SVE-NEXT: and z1.s, z1.s, #0x7fffffff ; SVE-NEXT: uzp1 z0.s, z0.s, z0.s ; SVE-NEXT: and z0.s, z0.s, #0x80000000 ; SVE-NEXT: orr z0.d, z1.d, z0.d @@ -274,8 +274,8 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) { define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) { ; SVE-LABEL: test_copysign_v4f32_v4f64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d ; SVE-NEXT: ldp q0, q1, [x1] +; SVE-NEXT: ptrue p0.d ; SVE-NEXT: fcvt z1.s, p0/m, z1.d ; SVE-NEXT: fcvt z0.s, p0/m, z0.d ; SVE-NEXT: ptrue p0.s, vl2 @@ -291,8 +291,8 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) { ; ; SVE2-LABEL: test_copysign_v4f32_v4f64: ; SVE2: // %bb.0: -; SVE2-NEXT: ptrue p0.d ; SVE2-NEXT: ldp q0, q1, [x1] +; SVE2-NEXT: ptrue p0.d ; SVE2-NEXT: ldr q2, [x0] ; SVE2-NEXT: fcvt z1.s, p0/m, z1.d ; SVE2-NEXT: fcvt z0.s, p0/m, z0.d @@ -319,8 +319,8 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) { ; SVE: // %bb.0: ; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldr q0, [x0] -; SVE-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; SVE-NEXT: ld1w { z1.d }, p0/z, [x1] +; SVE-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; SVE-NEXT: fcvt z1.d, p0/m, z1.s ; SVE-NEXT: and z1.d, z1.d, #0x8000000000000000 ; SVE-NEXT: orr z0.d, z0.d, z1.d @@ -354,10 +354,10 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) { ; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: mov x8, #2 // =0x2 ; SVE-NEXT: ldp q2, q3, [x0] -; SVE-NEXT: and z2.d, z2.d, #0x7fffffffffffffff -; SVE-NEXT: and z3.d, z3.d, #0x7fffffffffffffff ; SVE-NEXT: ld1w { z0.d }, p0/z, [x1] ; SVE-NEXT: ld1w { z1.d }, p0/z, [x1, x8, lsl #2] +; SVE-NEXT: and z2.d, z2.d, #0x7fffffffffffffff +; SVE-NEXT: and z3.d, z3.d, #0x7fffffffffffffff ; SVE-NEXT: fcvt z0.d, 
p0/m, z0.s ; SVE-NEXT: fcvt z1.d, p0/m, z1.s ; SVE-NEXT: and z0.d, z0.d, #0x8000000000000000 @@ -397,8 +397,8 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) { ; SVE-NEXT: ptrue p0.s ; SVE-NEXT: ldr q0, [x1] ; SVE-NEXT: ldr d1, [x0] -; SVE-NEXT: and z1.h, z1.h, #0x7fff ; SVE-NEXT: fcvt z0.h, p0/m, z0.s +; SVE-NEXT: and z1.h, z1.h, #0x7fff ; SVE-NEXT: uzp1 z0.h, z0.h, z0.h ; SVE-NEXT: and z0.h, z0.h, #0x8000 ; SVE-NEXT: orr z0.d, z1.d, z0.d @@ -429,13 +429,13 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; SVE: // %bb.0: ; SVE-NEXT: ldp q0, q1, [x1] ; SVE-NEXT: ptrue p0.s, vl2 -; SVE-NEXT: ptrue p1.s ; SVE-NEXT: fcvtxn v1.2s, v1.2d ; SVE-NEXT: fcvtxn v0.2s, v0.2d ; SVE-NEXT: splice z0.s, p0, z0.s, z1.s +; SVE-NEXT: ptrue p0.s ; SVE-NEXT: ldr d1, [x0] ; SVE-NEXT: and z1.h, z1.h, #0x7fff -; SVE-NEXT: fcvt z0.h, p1/m, z0.s +; SVE-NEXT: fcvt z0.h, p0/m, z0.s ; SVE-NEXT: uzp1 z0.h, z0.h, z0.h ; SVE-NEXT: and z0.h, z0.h, #0x8000 ; SVE-NEXT: orr z0.d, z1.d, z0.d @@ -446,13 +446,13 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; SVE2: // %bb.0: ; SVE2-NEXT: ldp q0, q1, [x1] ; SVE2-NEXT: ptrue p0.s, vl2 -; SVE2-NEXT: ptrue p1.s ; SVE2-NEXT: ldr d2, [x0] ; SVE2-NEXT: fcvtxn v1.2s, v1.2d ; SVE2-NEXT: fcvtxn v0.2s, v0.2d ; SVE2-NEXT: splice z0.s, p0, z0.s, z1.s +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: mov z1.h, #32767 // =0x7fff -; SVE2-NEXT: fcvt z0.h, p1/m, z0.s +; SVE2-NEXT: fcvt z0.h, p0/m, z0.s ; SVE2-NEXT: uzp1 z0.h, z0.h, z0.h ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str d2, [x0] @@ -470,8 +470,8 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) { ; SVE-LABEL: test_copysign_v8f16_v8f32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s ; SVE-NEXT: ldp q0, q1, [x1] +; SVE-NEXT: ptrue p0.s ; SVE-NEXT: fcvt z1.h, p0/m, z1.s ; SVE-NEXT: fcvt z0.h, p0/m, z0.s ; SVE-NEXT: ptrue p0.h, vl4 @@ -487,8 +487,8 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) { ; ; SVE2-LABEL: test_copysign_v8f16_v8f32: ; SVE2: // %bb.0: -; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: ldp q0, q1, [x1] +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: ldr q2, [x0] ; SVE2-NEXT: fcvt z1.h, p0/m, z1.s ; SVE2-NEXT: fcvt z0.h, p0/m, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll index c436dea8ff1b2e..c2d6ed4e9ccf95 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll @@ -50,8 +50,8 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fadd_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -94,8 +94,8 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fadd_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -125,8 +125,8 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fadd_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp 
q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -186,8 +186,8 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fdiv_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fdiv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fdivr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -230,8 +230,8 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fdiv_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fdiv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fdivr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -261,8 +261,8 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fdiv_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fdiv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fdivr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -325,8 +325,8 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.h, p0/m, z2.h, z1.h @@ -373,8 +373,8 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.s, p0/m, z2.s, z1.s @@ -407,8 +407,8 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.d, p0/m, z2.d, z1.d @@ -470,8 +470,8 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmul_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmul_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -514,8 +514,8 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmul_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmul_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -545,8 +545,8 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmul_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fmul_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx 
z1, z2 @@ -603,8 +603,8 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) { define void @fneg_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fneg_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fneg z0.h, p0/m, z0.h ; CHECK-NEXT: fneg z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -642,8 +642,8 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) { define void @fneg_v8f32(ptr %a) { ; CHECK-LABEL: fneg_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fneg z0.s, p0/m, z0.s ; CHECK-NEXT: fneg z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -669,8 +669,8 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) { define void @fneg_v4f64(ptr %a) { ; CHECK-LABEL: fneg_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fneg z0.d, p0/m, z0.d ; CHECK-NEXT: fneg z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -724,8 +724,8 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) { define void @fsqrt_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fsqrt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h ; CHECK-NEXT: fsqrt z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -763,8 +763,8 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) { define void @fsqrt_v8f32(ptr %a) { ; CHECK-LABEL: fsqrt_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s ; CHECK-NEXT: fsqrt z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -790,8 +790,8 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) { define void @fsqrt_v4f64(ptr %a) { ; CHECK-LABEL: fsqrt_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d ; CHECK-NEXT: fsqrt z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -848,8 +848,8 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fsub_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fsub_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fsubr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -892,8 +892,8 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fsub_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fsub_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fsubr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -923,8 +923,8 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fsub_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fsub_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fsubr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -981,8 +981,8 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) { define void @fabs_v16f16(ptr %a) { ; CHECK-LABEL: fabs_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fabs z0.h, p0/m, z0.h ; CHECK-NEXT: fabs z1.h, p0/m, z1.h 
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -1020,8 +1020,8 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) {
 define void @fabs_v8f32(ptr %a) {
 ; CHECK-LABEL: fabs_v8f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: fabs z0.s, p0/m, z0.s
 ; CHECK-NEXT: fabs z1.s, p0/m, z1.s
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -1047,8 +1047,8 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) {
 define void @fabs_v4f64(ptr %a) {
 ; CHECK-LABEL: fabs_v4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: fabs z0.d, p0/m, z0.d
 ; CHECK-NEXT: fabs z1.d, p0/m, z1.d
 ; CHECK-NEXT: stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
index aad078f035f7d6..e92694d1fc80d2 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
@@ -57,8 +57,8 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) {
 define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_oeq_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h
@@ -107,8 +107,8 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) {
 define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_oeq_v8f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s
 ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s
@@ -157,8 +157,8 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) {
 define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_oeq_v4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d
 ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d
@@ -181,8 +181,8 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ueq_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmeq p2.h, p0/z, z1.h, z0.h
@@ -209,8 +209,8 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_one_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT: fcmgt p2.h, p0/z, z1.h, z0.h
@@ -237,8 +237,8 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_une_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmne p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmne p0.h, p0/z, z2.h, z3.h
@@ -261,8 +261,8 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ogt_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h
@@ -285,8 +285,8 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ugt_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h
@@ -312,8 +312,8 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_olt_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h
@@ -336,8 +336,8 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ult_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h
@@ -363,8 +363,8 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_oge_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h
@@ -387,8 +387,8 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_uge_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h
@@ -414,8 +414,8 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ole_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h
@@ -438,8 +438,8 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ule_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h
@@ -465,8 +465,8 @@ define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_uno_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h
@@ -489,8 +489,8 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ord_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h
@@ -516,8 +516,8 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_eq_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h
@@ -540,8 +540,8 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ne_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmne p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmne p0.h, p0/z, z2.h, z3.h
@@ -564,8 +564,8 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_gt_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h
@@ -588,8 +588,8 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_lt_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h
@@ -612,8 +612,8 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_ge_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h
 ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h
@@ -636,8 +636,8 @@ define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) {
 define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fcmp_le_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
index 18f9a4d371d0cf..9bdde14e8d83df 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -8,9 +8,9 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @fp_convert_combine_crash(ptr %a, ptr %b) {
 ; CHECK-LABEL: fp_convert_combine_crash:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fmov z0.s, #8.00000000
+; CHECK-NEXT: ldp q1, q2, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z0.s
 ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z2.s
 ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 28e02da53af434..244a4051017395 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -11,8 +11,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v2f16_to_v2f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: uunpklo z0.s, z0.h
 ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT: str d0, [x0]
@@ -25,8 +25,8 @@ define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) {
 define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v4f16_to_v4f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: uunpklo z0.s, z0.h
 ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h
 ; CHECK-NEXT: str q0, [x0]
@@ -371,8 +371,8 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) {
 define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v8f32_v8f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: mov x8, #4 // =0x4
 ; CHECK-NEXT: fcvt z0.h, p0/m, z0.s
 ; CHECK-NEXT: fcvt z1.h, p0/m, z1.s
@@ -420,8 +420,8 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) {
 define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v4f64_v4f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: mov x8, #2 // =0x2
 ; CHECK-NEXT: fcvt z0.h, p0/m, z0.d
 ; CHECK-NEXT: fcvt z1.h, p0/m, z1.d
@@ -467,8 +467,8 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) {
 define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fcvt_v4f64_v4f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: mov x8, #2 // =0x2
 ; CHECK-NEXT: fcvt z0.s, p0/m, z0.d
 ; CHECK-NEXT: fcvt z1.s, p0/m, z1.d
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
index b5df97f767c13b..478be9ab76dd9f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -40,8 +40,8 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fma_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q4, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.h, p0/m, z2.h, z1.h
@@ -91,8 +91,8 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o
 define void @fma_v8f32(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fma_v8f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q0, q4, [x1]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q5, [x2]
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fmad z0.s, p0/m, z2.s, z1.s
@@ -140,8 +140,8 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double
 define void @fma_v4f64(ptr %a, ptr %b, ptr %c) {
 ; CHECK-LABEL: fma_v4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q4,
[x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.d, p0/m, z2.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll index 07a67e26502909..4dc034adf459ae 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll @@ -37,8 +37,8 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmaxnm_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmaxnm_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -81,8 +81,8 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmaxnm_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmaxnm_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -123,8 +123,8 @@ define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmaxnm_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fmaxnm_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -171,8 +171,8 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fminnm_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fminnm_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -215,8 +215,8 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fminnm_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fminnm_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -257,8 +257,8 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fminnm_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fminnm_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -305,8 +305,8 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmax_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmax_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -349,8 +349,8 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmax_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmax_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -391,8 +391,8 @@ define <2 x 
double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmax_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fmax_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -439,8 +439,8 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmin_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmin_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -483,8 +483,8 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmin_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmin_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -525,8 +525,8 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmin_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fmin_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll index d2d771c48c2044..bd10a0e091c0d4 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll @@ -211,8 +211,8 @@ define half @faddv_v8f16(half %start, <8 x half> %a) { define half @faddv_v16f16(half %start, ptr %a) { ; CHECK-LABEL: faddv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z2.h ; CHECK-NEXT: faddv h1, p0, z1.h ; CHECK-NEXT: fadd h0, h0, h1 @@ -249,8 +249,8 @@ define float @faddv_v4f32(float %start, <4 x float> %a) { define float @faddv_v8f32(float %start, ptr %a) { ; CHECK-LABEL: faddv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 @@ -285,8 +285,8 @@ define double @faddv_v2f64(double %start, <2 x double> %a) { define double @faddv_v4f64(double %start, ptr %a) { ; CHECK-LABEL: faddv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: fadd d0, d0, d1 @@ -327,8 +327,8 @@ define half @fmaxv_v8f16(<8 x half> %a) { define half @fmaxv_v16f16(ptr %a) { ; CHECK-LABEL: fmaxv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fmaxnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -365,8 +365,8 @@ define float @fmaxv_v4f32(<4 x float> %a) { define float @fmaxv_v8f32(ptr %a) { ; CHECK-LABEL: fmaxv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; 
CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fmaxnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -401,8 +401,8 @@ define double @fmaxv_v2f64(<2 x double> %a) { define double @fmaxv_v4f64(ptr %a) { ; CHECK-LABEL: fmaxv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fmaxnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -443,8 +443,8 @@ define half @fminv_v8f16(<8 x half> %a) { define half @fminv_v16f16(ptr %a) { ; CHECK-LABEL: fminv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fminnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -481,8 +481,8 @@ define float @fminv_v4f32(<4 x float> %a) { define float @fminv_v8f32(ptr %a) { ; CHECK-LABEL: fminv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fminnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -517,8 +517,8 @@ define double @fminv_v2f64(<2 x double> %a) { define double @fminv_v4f64(ptr %a) { ; CHECK-LABEL: fminv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fminnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -559,8 +559,8 @@ define half @fmaximumv_v8f16(<8 x half> %a) { define half @fmaximumv_v16f16(ptr %a) { ; CHECK-LABEL: fmaximumv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fmaxv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -597,8 +597,8 @@ define float @fmaximumv_v4f32(<4 x float> %a) { define float @fmaximumv_v8f32(ptr %a) { ; CHECK-LABEL: fmaximumv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fmaxv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -633,8 +633,8 @@ define double @fmaximumv_v2f64(<2 x double> %a) { define double @fmaximumv_v4f64(ptr %a) { ; CHECK-LABEL: fmaximumv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fmaxv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -675,8 +675,8 @@ define half @fminimumv_v8f16(<8 x half> %a) { define half @fminimumv_v16f16(ptr %a) { ; CHECK-LABEL: fminimumv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fminv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -713,8 +713,8 @@ define float @fminimumv_v4f32(<4 x float> %a) { define float @fminimumv_v8f32(ptr %a) { ; CHECK-LABEL: fminimumv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fminv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -749,8 +749,8 @@ define 
double @fminimumv_v2f64(<2 x double> %a) { define double @fminimumv_v4f64(ptr %a) { ; CHECK-LABEL: fminimumv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fminv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll index 580b43531070fc..24832d807c649c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll @@ -47,8 +47,8 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) { define void @frintp_v16f16(ptr %a) { ; CHECK-LABEL: frintp_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintp z0.h, p0/m, z0.h ; CHECK-NEXT: frintp z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -86,8 +86,8 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) { define void @frintp_v8f32(ptr %a) { ; CHECK-LABEL: frintp_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintp z0.s, p0/m, z0.s ; CHECK-NEXT: frintp z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -123,8 +123,8 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) { define void @frintp_v4f64(ptr %a) { ; CHECK-LABEL: frintp_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintp z0.d, p0/m, z0.d ; CHECK-NEXT: frintp z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -178,8 +178,8 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) { define void @frintm_v16f16(ptr %a) { ; CHECK-LABEL: frintm_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintm z0.h, p0/m, z0.h ; CHECK-NEXT: frintm z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -217,8 +217,8 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) { define void @frintm_v8f32(ptr %a) { ; CHECK-LABEL: frintm_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintm z0.s, p0/m, z0.s ; CHECK-NEXT: frintm z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -254,8 +254,8 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) { define void @frintm_v4f64(ptr %a) { ; CHECK-LABEL: frintm_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintm z0.d, p0/m, z0.d ; CHECK-NEXT: frintm z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -309,8 +309,8 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) { define void @frinti_v16f16(ptr %a) { ; CHECK-LABEL: frinti_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frinti z0.h, p0/m, z0.h ; CHECK-NEXT: frinti z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -348,8 +348,8 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) { define void @frinti_v8f32(ptr %a) { ; CHECK-LABEL: frinti_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frinti z0.s, p0/m, z0.s ; CHECK-NEXT: frinti z1.s, p0/m, z1.s ; CHECK-NEXT: stp 
q0, q1, [x0] @@ -385,8 +385,8 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) { define void @frinti_v4f64(ptr %a) { ; CHECK-LABEL: frinti_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frinti z0.d, p0/m, z0.d ; CHECK-NEXT: frinti z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -440,8 +440,8 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) { define void @frintx_v16f16(ptr %a) { ; CHECK-LABEL: frintx_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: frintx z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -479,8 +479,8 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) { define void @frintx_v8f32(ptr %a) { ; CHECK-LABEL: frintx_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: frintx z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -516,8 +516,8 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) { define void @frintx_v4f64(ptr %a) { ; CHECK-LABEL: frintx_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -571,8 +571,8 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) { define void @frinta_v16f16(ptr %a) { ; CHECK-LABEL: frinta_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frinta z0.h, p0/m, z0.h ; CHECK-NEXT: frinta z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -610,8 +610,8 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) { define void @frinta_v8f32(ptr %a) { ; CHECK-LABEL: frinta_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frinta z0.s, p0/m, z0.s ; CHECK-NEXT: frinta z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -647,8 +647,8 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) { define void @frinta_v4f64(ptr %a) { ; CHECK-LABEL: frinta_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frinta z0.d, p0/m, z0.d ; CHECK-NEXT: frinta z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -702,8 +702,8 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) { define void @frintn_v16f16(ptr %a) { ; CHECK-LABEL: frintn_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintn z0.h, p0/m, z0.h ; CHECK-NEXT: frintn z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -741,8 +741,8 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) { define void @frintn_v8f32(ptr %a) { ; CHECK-LABEL: frintn_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintn z0.s, p0/m, z0.s ; CHECK-NEXT: frintn z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -778,8 +778,8 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) { define void @frintn_v4f64(ptr %a) { ; CHECK-LABEL: frintn_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintn z0.d, p0/m, z0.d ; CHECK-NEXT: frintn z1.d, p0/m, z1.d ; 
CHECK-NEXT: stp q0, q1, [x0] @@ -833,8 +833,8 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) { define void @frintz_v16f16(ptr %a) { ; CHECK-LABEL: frintz_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintz z0.h, p0/m, z0.h ; CHECK-NEXT: frintz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -872,8 +872,8 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) { define void @frintz_v8f32(ptr %a) { ; CHECK-LABEL: frintz_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintz z0.s, p0/m, z0.s ; CHECK-NEXT: frintz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -909,8 +909,8 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) { define void @frintz_v4f64(ptr %a) { ; CHECK-LABEL: frintz_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintz z0.d, p0/m, z0.d ; CHECK-NEXT: frintz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll index 73fd7e14653433..132225546fc4fe 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) { ; CHECK-LABEL: select_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -23,8 +23,8 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) { define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) { ; CHECK-LABEL: select_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -39,8 +39,8 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) { define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) { ; CHECK-LABEL: select_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -55,8 +55,8 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) { define void @select_v16f16(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z0.h, w2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0x1 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -77,8 +77,8 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) { define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) { ; CHECK-LABEL: select_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -93,8 +93,8 @@ define <2 x 
float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) { define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) { ; CHECK-LABEL: select_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -109,8 +109,8 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) { define void @select_v8f32(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w2, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -150,9 +150,9 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) { ; CHECK-LABEL: select_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.d, x8 @@ -167,9 +167,9 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask define void @select_v4f64(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index d6adf9cf0ad672..58eae212d7999b 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -36,8 +36,8 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) { define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v16f16_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h ; CHECK-NEXT: fcvtzu z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] @@ -55,8 +55,8 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) { define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) { ; CHECK-LABEL: fcvtzu_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -68,8 +68,8 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) { define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) { ; CHECK-LABEL: fcvtzu_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -302,8 +302,8 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) { define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) { ; CHECK-LABEL: fcvtzu_v8f32_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; 
CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -320,8 +320,8 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) { define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v16f32_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s @@ -373,8 +373,8 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) { define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v8f32_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] @@ -392,8 +392,8 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) { define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) { ; CHECK-LABEL: fcvtzu_v1f32_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -405,8 +405,8 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) { define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) { ; CHECK-LABEL: fcvtzu_v2f32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -491,8 +491,8 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s @@ -520,8 +520,8 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d @@ -563,24 +563,22 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ldr q6, [x0, #112] -; CHECK-NEXT: ldp q4, q5, [x0, #80] -; CHECK-NEXT: ldr q7, [x0, #64] +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d +; CHECK-NEXT: ldp q6, q7, [x0, #64] ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d ; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d ; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s ; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z16.s, z1.s[1] @@ 
-606,25 +604,26 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z3.s, z5.s[1] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z6.s[1] +; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s ; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #20] +; CHECK-NEXT: strh w8, [sp, #24] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #16] +; CHECK-NEXT: strh w8, [sp, #20] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #30] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: strh w8, [sp, #16] ; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #26] +; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #18] ; CHECK-NEXT: ldp q1, q0, [sp] ; CHECK-NEXT: stp q1, q0, [x1] @@ -669,8 +668,8 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) { define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) { ; CHECK-LABEL: fcvtzu_v4f64_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 @@ -687,8 +686,8 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) { define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v8f64_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d @@ -740,8 +739,8 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) { define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v4f64_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] @@ -785,8 +784,8 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) { define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v16f16_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h ; CHECK-NEXT: fcvtzs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] @@ -804,8 +803,8 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) { define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) { ; CHECK-LABEL: fcvtzs_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -817,8 +816,8 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) { define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) { ; CHECK-LABEL: fcvtzs_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -1052,8 +1051,8 @@ define <4 x 
i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) { define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) { ; CHECK-LABEL: fcvtzs_v8f32_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -1070,8 +1069,8 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) { define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v16f32_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s @@ -1123,8 +1122,8 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) { define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v8f32_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] @@ -1142,8 +1141,8 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) { define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) { ; CHECK-LABEL: fcvtzs_v1f32_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -1155,8 +1154,8 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) { define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) { ; CHECK-LABEL: fcvtzs_v2f32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -1243,8 +1242,8 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1272,8 +1271,8 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d @@ -1315,24 +1314,22 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ldr q6, [x0, #112] -; CHECK-NEXT: ldp q4, q5, [x0, #80] -; CHECK-NEXT: ldr q7, [x0, #64] +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d +; CHECK-NEXT: ldp q6, q7, [x0, #64] ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d ; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d ; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z0.s, z0.s, 
z0.s +; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s ; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z16.s, z1.s[1] @@ -1358,25 +1355,26 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z3.s, z5.s[1] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z6.s[1] +; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s ; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #20] +; CHECK-NEXT: strh w8, [sp, #24] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #16] +; CHECK-NEXT: strh w8, [sp, #20] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #30] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: strh w8, [sp, #16] ; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #26] +; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #18] ; CHECK-NEXT: ldp q1, q0, [sp] ; CHECK-NEXT: stp q1, q0, [x1] @@ -1421,8 +1419,8 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) { define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) { ; CHECK-LABEL: fcvtzs_v4f64_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 @@ -1439,8 +1437,8 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) { define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v8f64_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d @@ -1492,8 +1490,8 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) { define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v4f64_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index ee8704284def5f..4c5a6fe2fd2315 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -71,8 +71,8 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask define void @select_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h @@ -128,8 +128,8 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m define void @select_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; 
CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s @@ -186,8 +186,8 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> define void @select_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll index 0b3e7695e6a0a5..4aa965777c742d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll @@ -11,9 +11,9 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) { ; CHECK-LABEL: insertelement_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -28,9 +28,9 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) { define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) { ; CHECK-LABEL: insertelement_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -45,9 +45,9 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) { define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) { ; CHECK-LABEL: insertelement_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -62,9 +62,9 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) { define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) { ; CHECK-LABEL: insertelement_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z2.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z3.b, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -80,9 +80,9 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) { define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) { ; CHECK-LABEL: insertelement_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -97,9 +97,9 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) { define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) { ; CHECK-LABEL: insertelement_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -114,9 +114,9 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) { define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) { ; 
CHECK-LABEL: insertelement_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -131,9 +131,9 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) { define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) { ; CHECK-LABEL: insertelement_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z2.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -149,9 +149,9 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) { define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) { ; CHECK-LABEL: insertelement_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -166,9 +166,9 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) { define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) { ; CHECK-LABEL: insertelement_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -183,9 +183,9 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) { define <8 x i32> @insertelement_v8i32(ptr %a) { ; CHECK-LABEL: insertelement_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #5 // =0x5 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s @@ -212,9 +212,9 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) { define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) { ; CHECK-LABEL: insertelement_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -229,9 +229,9 @@ define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) { define <4 x i64> @insertelement_v4i64(ptr %a) { ; CHECK-LABEL: insertelement_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov w8, #5 // =0x5 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d @@ -264,9 +264,9 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) { define <4 x half> @insertelement_v4f16(<4 x half> %op1) { ; CHECK-LABEL: insertelement_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h @@ -281,9 +281,9 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) { define <8 x half> @insertelement_v8f16(<8 x half> %op1) { ; CHECK-LABEL: insertelement_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 
def $z0 ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h @@ -298,9 +298,9 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) { define <16 x half> @insertelement_v16f16(ptr %a) { ; CHECK-LABEL: insertelement_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z0.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: fmov h2, #5.00000000 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h @@ -317,9 +317,9 @@ define <16 x half> @insertelement_v16f16(ptr %a) { define <2 x float> @insertelement_v2f32(<2 x float> %op1) { ; CHECK-LABEL: insertelement_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s @@ -334,9 +334,9 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) { define <4 x float> @insertelement_v4f32(<4 x float> %op1) { ; CHECK-LABEL: insertelement_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s @@ -351,9 +351,9 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) { define <8 x float> @insertelement_v8f32(ptr %a) { ; CHECK-LABEL: insertelement_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: fmov s2, #5.00000000 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s @@ -379,9 +379,9 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) { define <2 x double> @insertelement_v2f64(<2 x double> %op1) { ; CHECK-LABEL: insertelement_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d @@ -396,9 +396,9 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) { define <4 x double> @insertelement_v4f64(ptr %a) { ; CHECK-LABEL: insertelement_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: fmov d2, #5.00000000 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll index e3c4b6f1cb53f9..8baa87c6d686de 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll @@ -262,8 +262,8 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @mul_v32i8(ptr %a, ptr %b) { ; SVE-LABEL: mul_v32i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: mul z0.b, p0/m, z0.b, z1.b ; SVE-NEXT: movprfx z1, z2 @@ -352,8 +352,8 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @mul_v16i16(ptr %a, ptr %b) { ; SVE-LABEL: mul_v16i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q1, q2, 
[x0] ; SVE-NEXT: mul z0.h, p0/m, z0.h, z1.h ; SVE-NEXT: movprfx z1, z2 @@ -421,8 +421,8 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @mul_v8i32(ptr %a, ptr %b) { ; SVE-LABEL: mul_v8i32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: mul z0.s, p0/m, z0.s, z1.s ; SVE-NEXT: movprfx z1, z2 @@ -490,8 +490,8 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @mul_v4i64(ptr %a, ptr %b) { ; SVE-LABEL: mul_v4i64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: mul z0.d, p0/m, z0.d, z1.d ; SVE-NEXT: movprfx z1, z2 @@ -746,8 +746,8 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) { define void @abs_v32i8(ptr %a) { ; CHECK-LABEL: abs_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: abs z0.b, p0/m, z0.b ; CHECK-NEXT: abs z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -798,8 +798,8 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) { define void @abs_v16i16(ptr %a) { ; CHECK-LABEL: abs_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: abs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -837,8 +837,8 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) { define void @abs_v8i32(ptr %a) { ; CHECK-LABEL: abs_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: abs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -876,8 +876,8 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) { define void @abs_v4i64(ptr %a) { ; CHECK-LABEL: abs_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: abs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll index 6200e44218a96c..73c1eac99dd303 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll @@ -41,8 +41,8 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @icmp_eq_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b @@ -91,8 +91,8 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @icmp_eq_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h @@ -141,8 +141,8 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @icmp_eq_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: 
ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s @@ -191,8 +191,8 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @icmp_eq_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d @@ -215,8 +215,8 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) { define void @icmp_ne_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_ne_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpne p1.b, p0/z, z1.b, z0.b ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, z3.b @@ -261,8 +261,8 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) { define void @icmp_sgt_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_sgt_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpgt p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, z3.h @@ -307,8 +307,8 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) { define void @icmp_slt_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_slt_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpgt p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: cmpgt p0.s, p0/z, z3.s, z2.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index fcf4f21c6ea842..5158dda37a8b9d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -86,8 +86,8 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b @@ -163,18 +163,18 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: sdiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b ; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret @@ -203,9 +203,9 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: sdiv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z1.s, 
z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s @@ -272,8 +272,8 @@ define void @sdiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z1.h ; CHECK-NEXT: stp q3, q0, [x0] @@ -314,8 +314,8 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @sdiv_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: sdiv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -358,8 +358,8 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @sdiv_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: sdiv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: sdivr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -453,8 +453,8 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b @@ -530,18 +530,18 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: udiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b ; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret @@ -555,9 +555,9 @@ define void @udiv_v32i8(ptr %a, ptr %b) { define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-LABEL: udiv_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z1.s, z1.s, #0xffff ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -570,9 +570,9 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: udiv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -639,8 +639,8 @@ define void @udiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; 
CHECK-NEXT: splice z0.h, p0, z0.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z1.h ; CHECK-NEXT: stp q3, q0, [x0] @@ -681,8 +681,8 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @udiv_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: udiv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -725,8 +725,8 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @udiv_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: udiv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: udivr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -743,10 +743,10 @@ define void @udiv_v4i64(ptr %a, ptr %b) { define void @udiv_constantsplat_v8i32(ptr %a) { ; SVE-LABEL: udiv_constantsplat_v8i32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: mov w8, #8969 // =0x2309 -; SVE-NEXT: movk w8, #22765, lsl #16 ; SVE-NEXT: ldp q1, q2, [x0] +; SVE-NEXT: movk w8, #22765, lsl #16 +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: mov z0.s, w8 ; SVE-NEXT: movprfx z3, z1 ; SVE-NEXT: umulh z3.s, p0/m, z3.s, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll index 0785c67ce6f41d..f028b3eeca2571 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll @@ -221,8 +221,8 @@ define void @ashr_v4i64(ptr %a) { define void @icmp_eq_v32i8(ptr %a) { ; CHECK-LABEL: icmp_eq_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, #7 ; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, #7 ; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff @@ -241,8 +241,8 @@ define void @icmp_eq_v32i8(ptr %a) { define void @icmp_sge_v16i16(ptr %a) { ; CHECK-LABEL: icmp_sge_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: cmpge p1.h, p0/z, z0.h, #15 ; CHECK-NEXT: cmpge p0.h, p0/z, z1.h, #15 ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff @@ -261,8 +261,8 @@ define void @icmp_sge_v16i16(ptr %a) { define void @icmp_sgt_v8i32(ptr %a) { ; CHECK-LABEL: icmp_sgt_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: cmpgt p1.s, p0/z, z0.s, #-8 ; CHECK-NEXT: cmpgt p0.s, p0/z, z1.s, #-8 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff @@ -281,8 +281,8 @@ define void @icmp_sgt_v8i32(ptr %a) { define void @icmp_ult_v4i64(ptr %a) { ; CHECK-LABEL: icmp_ult_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cmplo p1.d, p0/z, z0.d, #63 ; CHECK-NEXT: cmplo p0.d, p0/z, z1.d, #63 ; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll index d7600c6e6192d9..50cf9b73d9a79c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll 
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll @@ -37,8 +37,8 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @smax_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -81,8 +81,8 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @smax_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -125,8 +125,8 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @smax_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -171,8 +171,8 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @smax_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -219,8 +219,8 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @smin_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -263,8 +263,8 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @smin_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -307,8 +307,8 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @smin_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -353,8 +353,8 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @smin_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -401,8 +401,8 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @umax_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -445,8 +445,8 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @umax_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v16i16: ; CHECK: // %bb.0: -; 
CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -489,8 +489,8 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @umax_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -535,8 +535,8 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @umax_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -583,8 +583,8 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @umin_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -627,8 +627,8 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @umin_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -671,8 +671,8 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @umin_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -717,8 +717,8 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @umin_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll index c48cb315a7aa34..cb7fa53eac5130 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -101,8 +101,8 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @smulh_v32i8(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v32i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.b, p0/m, z0.b, z1.b ; SVE-NEXT: movprfx z1, z2 @@ -214,8 +214,8 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @smulh_v16i16(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v16i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.h, p0/m, z0.h, z1.h ; SVE-NEXT: movprfx z1, z2 @@ -295,8 +295,8 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void 
@smulh_v8i32(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v8i32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.s, p0/m, z0.s, z1.s ; SVE-NEXT: movprfx z1, z2 @@ -378,8 +378,8 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @smulh_v4i64(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v4i64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.d, p0/m, z0.d, z1.d ; SVE-NEXT: movprfx z1, z2 @@ -413,9 +413,9 @@ define void @smulh_v4i64(ptr %a, ptr %b) { define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; SVE-LABEL: umulh_v4i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl4 ; SVE-NEXT: // kill: def $d1 killed $d1 def $z1 ; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: ptrue p0.h, vl4 ; SVE-NEXT: and z0.h, z0.h, #0xff ; SVE-NEXT: and z1.h, z1.h, #0xff ; SVE-NEXT: mul z0.h, p0/m, z0.h, z1.h @@ -494,8 +494,8 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @umulh_v32i8(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v32i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.b, p0/m, z0.b, z1.b ; SVE-NEXT: movprfx z1, z2 @@ -525,9 +525,9 @@ define void @umulh_v32i8(ptr %a, ptr %b) { define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; SVE-LABEL: umulh_v2i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl2 ; SVE-NEXT: // kill: def $d1 killed $d1 def $z1 ; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: ptrue p0.s, vl2 ; SVE-NEXT: and z0.s, z0.s, #0xffff ; SVE-NEXT: and z1.s, z1.s, #0xffff ; SVE-NEXT: mul z0.s, p0/m, z0.s, z1.s @@ -606,8 +606,8 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @umulh_v16i16(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v16i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.h, p0/m, z0.h, z1.h ; SVE-NEXT: movprfx z1, z2 @@ -687,8 +687,8 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @umulh_v8i32(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v8i32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.s, p0/m, z0.s, z1.s ; SVE-NEXT: movprfx z1, z2 @@ -770,8 +770,8 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @umulh_v4i64(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v4i64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.d, p0/m, z0.d, z1.d ; SVE-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll index c51630ecd752a5..751f43768a511a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll @@ -37,8 +37,8 @@ define i8 @uaddv_v16i8(<16 x i8> %a) { define i8 @uaddv_v32i8(ptr %a) { ; CHECK-LABEL: uaddv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: add z0.b, z1.b, z0.b ; CHECK-NEXT: uaddv d0, p0, z0.b ; 
CHECK-NEXT: fmov x0, d0 @@ -78,8 +78,8 @@ define i16 @uaddv_v8i16(<8 x i16> %a) { define i16 @uaddv_v16i16(ptr %a) { ; CHECK-LABEL: uaddv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: uaddv d0, p0, z0.h ; CHECK-NEXT: fmov x0, d0 @@ -119,8 +119,8 @@ define i32 @uaddv_v4i32(<4 x i32> %a) { define i32 @uaddv_v8i32(ptr %a) { ; CHECK-LABEL: uaddv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEXT: uaddv d0, p0, z0.s ; CHECK-NEXT: fmov x0, d0 @@ -146,8 +146,8 @@ define i64 @uaddv_v2i64(<2 x i64> %a) { define i64 @uaddv_v4i64(ptr %a) { ; CHECK-LABEL: uaddv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: uaddv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -188,8 +188,8 @@ define i8 @smaxv_v16i8(<16 x i8> %a) { define i8 @smaxv_v32i8(ptr %a) { ; CHECK-LABEL: smaxv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -226,8 +226,8 @@ define i16 @smaxv_v8i16(<8 x i16> %a) { define i16 @smaxv_v16i16(ptr %a) { ; CHECK-LABEL: smaxv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -264,8 +264,8 @@ define i32 @smaxv_v4i32(<4 x i32> %a) { define i32 @smaxv_v8i32(ptr %a) { ; CHECK-LABEL: smaxv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -291,8 +291,8 @@ define i64 @smaxv_v2i64(<2 x i64> %a) { define i64 @smaxv_v4i64(ptr %a) { ; CHECK-LABEL: smaxv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: smaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -333,8 +333,8 @@ define i8 @sminv_v16i8(<16 x i8> %a) { define i8 @sminv_v32i8(ptr %a) { ; CHECK-LABEL: sminv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -371,8 +371,8 @@ define i16 @sminv_v8i16(<8 x i16> %a) { define i16 @sminv_v16i16(ptr %a) { ; CHECK-LABEL: sminv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -409,8 +409,8 @@ define i32 @sminv_v4i32(<4 x i32> %a) { define i32 @sminv_v8i32(ptr %a) { ; CHECK-LABEL: sminv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -436,8 +436,8 @@ define i64 @sminv_v2i64(<2 x i64> %a) { define i64 @sminv_v4i64(ptr %a) { ; CHECK-LABEL: sminv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, 
[x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: sminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -478,8 +478,8 @@ define i8 @umaxv_v16i8(<16 x i8> %a) { define i8 @umaxv_v32i8(ptr %a) { ; CHECK-LABEL: umaxv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -516,8 +516,8 @@ define i16 @umaxv_v8i16(<8 x i16> %a) { define i16 @umaxv_v16i16(ptr %a) { ; CHECK-LABEL: umaxv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -554,8 +554,8 @@ define i32 @umaxv_v4i32(<4 x i32> %a) { define i32 @umaxv_v8i32(ptr %a) { ; CHECK-LABEL: umaxv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -581,8 +581,8 @@ define i64 @umaxv_v2i64(<2 x i64> %a) { define i64 @umaxv_v4i64(ptr %a) { ; CHECK-LABEL: umaxv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: umaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -623,8 +623,8 @@ define i8 @uminv_v16i8(<16 x i8> %a) { define i8 @uminv_v32i8(ptr %a) { ; CHECK-LABEL: uminv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -661,8 +661,8 @@ define i16 @uminv_v8i16(<8 x i16> %a) { define i16 @uminv_v16i16(ptr %a) { ; CHECK-LABEL: uminv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -699,8 +699,8 @@ define i32 @uminv_v4i32(<4 x i32> %a) { define i32 @uminv_v8i32(ptr %a) { ; CHECK-LABEL: uminv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -726,8 +726,8 @@ define i64 @uminv_v2i64(<2 x i64> %a) { define i64 @uminv_v4i64(ptr %a) { ; CHECK-LABEL: uminv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 4a1209b942f4a0..d373a9063f8521 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -65,7 +65,6 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z2.h, z2.b @@ -91,15 +90,16 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> 
%op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b -; CHECK-NEXT: mls z0.b, p1/m, z3.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z3.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = srem <16 x i8> %op1, %op2 @@ -112,7 +112,6 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: sunpklo z7.h, z1.b @@ -171,22 +170,23 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h ; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h ; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b +; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b -; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z6.b, z1.b ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b +; CHECK-NEXT: mls z2.b, p0/m, z7.b, z4.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a @@ -199,11 +199,11 @@ define void @srem_v32i8(ptr %a, ptr %b) { define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: srem_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h @@ -223,7 +223,6 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z4.s, z0.h -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z2.s, z2.h @@ -235,7 +234,8 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h -; CHECK-NEXT: mls z0.h, p1/m, z3.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = srem <8 x i16> %op1, %op2 @@ -248,7 +248,6 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q4, q1, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: mov z5.d, z4.d 
@@ -277,9 +276,10 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z6.h, p0, z6.h, z5.h ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.h, p1/m, z6.h, z4.h -; CHECK-NEXT: mls z0.h, p1/m, z7.h, z1.h +; CHECK-NEXT: mls z2.h, p0/m, z6.h, z4.h +; CHECK-NEXT: mls z0.h, p0/m, z7.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a @@ -322,8 +322,8 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @srem_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: srem_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z0.s @@ -374,8 +374,8 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @srem_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: srem_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: sdiv z4.d, p0/m, z4.d, z0.d @@ -454,7 +454,6 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z2.h, z2.b @@ -480,15 +479,16 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b -; CHECK-NEXT: mls z0.b, p1/m, z3.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z3.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = urem <16 x i8> %op1, %op2 @@ -501,7 +501,6 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: uunpklo z7.h, z1.b @@ -560,22 +559,23 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h ; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h ; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b +; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b -; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z6.b, z1.b ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b +; CHECK-NEXT: mls z2.b, 
p0/m, z7.b, z4.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a @@ -588,11 +588,11 @@ define void @urem_v32i8(ptr %a, ptr %b) { define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: urem_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h @@ -612,7 +612,6 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z4.s, z0.h -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z2.s, z2.h @@ -624,7 +623,8 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h -; CHECK-NEXT: mls z0.h, p1/m, z3.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = urem <8 x i16> %op1, %op2 @@ -637,7 +637,6 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q4, q1, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: mov z5.d, z4.d @@ -666,9 +665,10 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z6.h, p0, z6.h, z5.h ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.h, p1/m, z6.h, z4.h -; CHECK-NEXT: mls z0.h, p1/m, z7.h, z1.h +; CHECK-NEXT: mls z2.h, p0/m, z6.h, z4.h +; CHECK-NEXT: mls z0.h, p0/m, z7.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a @@ -711,8 +711,8 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @urem_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: urem_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z0.s @@ -763,8 +763,8 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @urem_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: urem_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: udiv z4.d, p0/m, z4.d, z0.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll index 3b58e35bd844c5..906112f7ac39e4 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) { ; CHECK-LABEL: select_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; 
CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -23,8 +23,8 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) { define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) { ; CHECK-LABEL: select_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 @@ -38,8 +38,8 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) { define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) { ; CHECK-LABEL: select_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 @@ -53,8 +53,8 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) { define void @select_v32i8(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z0.b, w2 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x0, #16] @@ -74,8 +74,8 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) { define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) { ; CHECK-LABEL: select_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -90,8 +90,8 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) { define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) { ; CHECK-LABEL: select_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -106,8 +106,8 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) { define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) { ; CHECK-LABEL: select_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -122,8 +122,8 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) { define void @select_v16i16(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z0.h, w2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0x1 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -144,8 +144,8 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) { define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) { ; CHECK-LABEL: select_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -160,8 +160,8 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) { define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) { ; CHECK-LABEL: select_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: 
ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -176,8 +176,8 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) { define void @select_v8i32(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w2, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -198,9 +198,9 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) { define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) { ; CHECK-LABEL: select_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.d, x8 @@ -215,9 +215,9 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) { define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) { ; CHECK-LABEL: select_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.d, x8 @@ -232,9 +232,9 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) { define void @select_v4i64(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll index c7fa0e8ad5e4ad..9ed52e321d9ab1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll @@ -52,8 +52,8 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @ashr_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -111,8 +111,8 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @ashr_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -155,8 +155,8 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @ashr_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -199,8 +199,8 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @ashr_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v4i64: ; CHECK: // 
%bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -221,9 +221,9 @@ define void @ashr_v4i64(ptr %a, ptr %b) { define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: lshr_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h @@ -262,8 +262,8 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @lshr_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -280,9 +280,9 @@ define void @lshr_v32i8(ptr %a, ptr %b) { define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-LABEL: lshr_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z1.s, z1.s, #0xffff ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s @@ -321,8 +321,8 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @lshr_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -365,8 +365,8 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @lshr_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -409,8 +409,8 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @lshr_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -431,8 +431,8 @@ define void @lshr_v4i64(ptr %a, ptr %b) { define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { ; CHECK-LABEL: shl_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: and z1.s, z1.s, #0xff ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s @@ -445,8 +445,8 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: shl_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h @@ -485,8 +485,8 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @shl_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v32i8: ; CHECK: // %bb.0: -; 
CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -529,8 +529,8 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @shl_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -573,8 +573,8 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @shl_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -617,8 +617,8 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @shl_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 5c5cf68135bf8d..b285659258f31d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -36,8 +36,8 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) { define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v16i16_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ucvtf z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] @@ -55,8 +55,8 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { ; CHECK-LABEL: ucvtf_v2i16_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -68,8 +68,8 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) { ; CHECK-LABEL: ucvtf_v4i16_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -277,8 +277,8 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) { define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) { ; CHECK-LABEL: ucvtf_v8i32_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ucvtf z1.h, p0/m, z1.s ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -295,8 +295,8 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) { define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v16i32_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: ucvtf z1.h, p0/m, 
z1.s ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s @@ -348,8 +348,8 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) { define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v8i32_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] @@ -367,8 +367,8 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) { define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) { ; CHECK-LABEL: ucvtf_v2i32_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -446,16 +446,16 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) { define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) { ; CHECK-LABEL: ucvtf_v4i64_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: fcvt z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -467,10 +467,9 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) { define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) { ; CHECK-LABEL: ucvtf_v8i64_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z3.s, p0/m, z3.d @@ -482,11 +481,12 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) { ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movprfx z0, z1 -; CHECK-NEXT: fcvt z0.h, p1/m, z1.s +; CHECK-NEXT: fcvt z0.h, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: fcvt z1.h, p1/m, z2.s +; CHECK-NEXT: fcvt z1.h, p0/m, z2.s +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.h, z1.h, z1.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z2.h @@ -517,8 +517,8 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) { define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) { ; CHECK-LABEL: ucvtf_v4i64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 @@ -535,8 +535,8 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) { define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v8i64_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d @@ -576,8 +576,8 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) { define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v4i64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: 
ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT: stp q0, q1, [x1]
@@ -621,8 +621,8 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) {
 define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: scvtf_v16i16_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h
 ; CHECK-NEXT: scvtf z1.h, p0/m, z1.h
 ; CHECK-NEXT: stp q0, q1, [x1]
@@ -652,8 +652,8 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) {
 define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) {
 ; CHECK-LABEL: scvtf_v4i16_v4f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: sunpklo z0.s, z0.h
 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
@@ -850,8 +850,8 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) {
 define <8 x half> @scvtf_v8i32_v8f16(ptr %a) {
 ; CHECK-LABEL: scvtf_v8i32_v8f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: scvtf z1.h, p0/m, z1.s
 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.s
 ; CHECK-NEXT: ptrue p0.h, vl4
@@ -896,8 +896,8 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) {
 define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: scvtf_v8i32_v8f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s
 ; CHECK-NEXT: scvtf z1.s, p0/m, z1.s
 ; CHECK-NEXT: stp q0, q1, [x1]
@@ -915,8 +915,8 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) {
 define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) {
 ; CHECK-LABEL: scvtf_v2i32_v2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: sunpklo z0.d, z0.s
 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
@@ -1038,16 +1038,16 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) {
 define <4 x half> @scvtf_v4i64_v4f16(ptr %a) {
 ; CHECK-LABEL: scvtf_v4i64_v4f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: scvtf z1.s, p0/m, z1.d
 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
 ; CHECK-NEXT: ptrue p0.s, vl2
 ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
-; CHECK-NEXT: fcvt z0.h, p1/m, z0.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvt z0.h, p0/m, z0.s
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
@@ -1076,8 +1076,8 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) {
 define <4 x float> @scvtf_v4i64_v4f32(ptr %a) {
 ; CHECK-LABEL: scvtf_v4i64_v4f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: scvtf z1.s, p0/m, z1.d
 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d
 ; CHECK-NEXT: ptrue p0.s, vl2
@@ -1110,8 +1110,8 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) {
 define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: scvtf_v4i64_v4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d
 ; CHECK-NEXT: scvtf z1.d, p0/m, z1.d
 ; CHECK-NEXT: stp q0, q1, [x1]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
index 1809cfcf3db690..81bbaa92d4b471 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
@@ -61,8 +61,8 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
 define void @select_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: select_v32i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q0, q2, [x0]
+; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q3, [x1]
 ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b
 ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b
@@ -136,8 +136,8 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) {
 define void @select_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: select_v16i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q2, [x0]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q3, [x1]
 ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h
 ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h
@@ -193,8 +193,8 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) {
 define void @select_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: select_v8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q0, q2, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q3, [x1]
 ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s
 ; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s
@@ -213,9 +213,9 @@ define void @select_v8i32(ptr %a, ptr %b) {
 define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) {
 ; CHECK-LABEL: select_v1i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: and x8, x0, #0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
 ; CHECK-NEXT: mov z2.d, x8
@@ -249,8 +249,8 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) {
 define void @select_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: select_v4i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q2, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q3, [x1]
 ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d
 ; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
index bb1bd8fe72b21e..c4aeb4465c5373 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
@@ -48,8 +48,8 @@ define i8 @andv_v16i8(<16 x i8> %a) {
 define i8 @andv_v32i8(ptr %a) {
 ; CHECK-LABEL: andv_v32i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: and z0.d, z1.d, z0.d
 ; CHECK-NEXT: andv b0, p0, z0.b
 ; CHECK-NEXT: fmov w0, s0
@@ -98,8 +98,8 @@ define i16 @andv_v8i16(<8 x i16> %a) {
 define i16 @andv_v16i16(ptr %a) {
 ; CHECK-LABEL: andv_v16i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: and z0.d, z1.d, z0.d
 ; CHECK-NEXT: andv h0, p0, z0.h
 ; CHECK-NEXT: fmov w0, s0
@@ -136,8 +136,8 @@ define i32 @andv_v4i32(<4 x i32> %a) {
 define i32 @andv_v8i32(ptr %a) {
 ; CHECK-LABEL: andv_v8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: and z0.d, z1.d, z0.d
 ; CHECK-NEXT: andv s0, p0, z0.s
 ; CHECK-NEXT: fmov w0, s0
@@ -162,8 +162,8 @@ define i64 @andv_v2i64(<2 x i64> %a) {
 define i64 @andv_v4i64(ptr %a) {
 ; CHECK-LABEL: andv_v4i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: and z0.d, z1.d, z0.d
 ; CHECK-NEXT: andv d0, p0, z0.d
 ; CHECK-NEXT: fmov x0, d0
@@ -216,8 +216,8 @@ define i8 @eorv_v16i8(<16 x i8> %a) {
 define i8 @eorv_v32i8(ptr %a) {
 ; CHECK-LABEL: eorv_v32i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: eor z0.d, z1.d, z0.d
 ; CHECK-NEXT: eorv b0, p0, z0.b
 ; CHECK-NEXT: fmov w0, s0
@@ -266,8 +266,8 @@ define i16 @eorv_v8i16(<8 x i16> %a) {
 define i16 @eorv_v16i16(ptr %a) {
 ; CHECK-LABEL: eorv_v16i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: eor z0.d, z1.d, z0.d
 ; CHECK-NEXT: eorv h0, p0, z0.h
 ; CHECK-NEXT: fmov w0, s0
@@ -304,8 +304,8 @@ define i32 @eorv_v4i32(<4 x i32> %a) {
 define i32 @eorv_v8i32(ptr %a) {
 ; CHECK-LABEL: eorv_v8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: eor z0.d, z1.d, z0.d
 ; CHECK-NEXT: eorv s0, p0, z0.s
 ; CHECK-NEXT: fmov w0, s0
@@ -330,8 +330,8 @@ define i64 @eorv_v2i64(<2 x i64> %a) {
 define i64 @eorv_v4i64(ptr %a) {
 ; CHECK-LABEL: eorv_v4i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: eor z0.d, z1.d, z0.d
 ; CHECK-NEXT: eorv d0, p0, z0.d
 ; CHECK-NEXT: fmov x0, d0
@@ -384,8 +384,8 @@ define i8 @orv_v16i8(<16 x i8> %a) {
 define i8 @orv_v32i8(ptr %a) {
 ; CHECK-LABEL: orv_v32i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: orr z0.d, z1.d, z0.d
 ; CHECK-NEXT: orv b0, p0, z0.b
 ; CHECK-NEXT: fmov w0, s0
@@ -434,8 +434,8 @@ define i16 @orv_v8i16(<8 x i16> %a) {
 define i16 @orv_v16i16(ptr %a) {
 ; CHECK-LABEL: orv_v16i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: orr z0.d, z1.d, z0.d
 ; CHECK-NEXT: orv h0, p0, z0.h
 ; CHECK-NEXT: fmov w0, s0
@@ -472,8 +472,8 @@ define i32 @orv_v4i32(<4 x i32> %a) {
 define i32 @orv_v8i32(ptr %a) {
 ; CHECK-LABEL: orv_v8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: orr z0.d, z1.d, z0.d
 ; CHECK-NEXT: orv s0, p0, z0.s
 ; CHECK-NEXT: fmov w0, s0
@@ -498,8 +498,8 @@ define i64 @orv_v2i64(<2 x i64> %a) {
 define i64 @orv_v4i64(ptr %a) {
 ; CHECK-LABEL: orv_v4i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q0, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: orr z0.d, z1.d, z0.d
 ; CHECK-NEXT: orv d0, p0, z0.d
 ; CHECK-NEXT: fmov x0, d0
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index e8127067447452..f2b3f9b12ea718 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -123,8 +123,8 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) {
 ; CHECK-NEXT: asr z0.b, z0.b, #7
 ; CHECK-NEXT: asr z1.b, z1.b, #7
 ; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0
-; CHECK-NEXT: mov z0.b, #0 // =0x0
 ; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
+; CHECK-NEXT: mov z0.b, #0 // =0x0
 ; CHECK-NEXT: st1b { z0.b }, p1, [x0, x8]
 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT: add sp, sp, #32
@@ -204,8 +204,8 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) {
 ; CHECK-NEXT: asr z0.h, z0.h, #15
 ; CHECK-NEXT: asr z1.h, z1.h, #15
 ; CHECK-NEXT: cmpne p1.h, p0/z, z1.h, #0
-; CHECK-NEXT: mov z1.h, #0 // =0x0
 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: mov z1.h, #0 // =0x0
 ; CHECK-NEXT: st1h { z1.h }, p1, [x0, x8, lsl #1]
 ; CHECK-NEXT: st1h { z1.h }, p0, [x0]
 ; CHECK-NEXT: ret
@@ -242,7 +242,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: fmov w8, s1
 ; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: mov z2.s, #0 // =0x0
+; CHECK-NEXT: mov z2.b, z0.b[3]
 ; CHECK-NEXT: strh w8, [sp, #14]
 ; CHECK-NEXT: fmov w8, s3
 ; CHECK-NEXT: mov z3.b, z0.b[2]
@@ -258,9 +258,9 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT: lsl z1.s, z1.s, #31
 ; CHECK-NEXT: asr z1.s, z1.s, #31
 ; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0
-; CHECK-NEXT: mov z1.b, z0.b[3]
-; CHECK-NEXT: st1w { z2.s }, p1, [x0, x8, lsl #2]
-; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: mov z1.s, #0 // =0x0
+; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
+; CHECK-NEXT: fmov w8, s2
 ; CHECK-NEXT: strh w9, [sp]
 ; CHECK-NEXT: strh w8, [sp, #6]
 ; CHECK-NEXT: fmov w8, s3
@@ -272,7 +272,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) {
 ; CHECK-NEXT: lsl z0.s, z0.s, #31
 ; CHECK-NEXT: asr z0.s, z0.s, #31
 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
-; CHECK-NEXT: st1w { z2.s }, p0, [x0]
+; CHECK-NEXT: st1w { z1.s }, p0, [x0]
 ; CHECK-NEXT: add sp, sp, #16
 ; CHECK-NEXT: ret
 call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask)
@@ -310,8 +310,8 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) {
 ; CHECK-NEXT: asr z1.d, z1.d, #63
 ; CHECK-NEXT: asr z0.d, z0.d, #63
 ; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0
-; CHECK-NEXT: mov z0.d, #0 // =0x0
 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT: mov z0.d, #0 // =0x0
 ; CHECK-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3]
 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index f0b0b3269e98ff..6fcb95f2833388 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -170,8 +170,8 @@ define void @abs_v4i32(ptr %a) {
 define void @abs_v8i32(ptr %a) {
 ; CHECK-LABEL: abs_v8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: abs z0.s, p0/m, z0.s
 ; CHECK-NEXT: abs z1.s, p0/m, z1.s
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -199,8 +199,8 @@ define void @abs_v2i64(ptr %a) {
 define void @abs_v4i64(ptr %a) {
 ; CHECK-LABEL: abs_v4i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: abs z0.d, p0/m, z0.d
 ; CHECK-NEXT: abs z1.d, p0/m, z1.d
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -263,8 +263,8 @@ define void @fadd_v8f16(ptr %a, ptr %b) {
 define void @fadd_v16f16(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v16f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: movprfx z1, z2
@@ -313,8 +313,8 @@ define void @fadd_v4f32(ptr %a, ptr %b) {
 define void @fadd_v8f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v8f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: movprfx z1, z2
@@ -347,8 +347,8 @@ define void @fadd_v2f64(ptr %a, ptr %b) {
 define void @fadd_v4f64(ptr %a, ptr %b) {
 ; CHECK-LABEL: fadd_v4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q3, [x1]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q1, q2, [x0]
 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: movprfx z1, z2
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index d1bff4fa21a113..00413302798ca1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -9,8 +9,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @test_revbv16i16(ptr %a) {
 ; CHECK-LABEL: test_revbv16i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: revb z0.h, p0/m, z0.h
 ; CHECK-NEXT: revb z1.h, p0/m, z1.h
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -25,8 +25,8 @@ define void @test_revbv16i16(ptr %a) {
 define void @test_revbv8i32(ptr %a) {
 ; CHECK-LABEL: test_revbv8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: revb z0.s, p0/m, z0.s
 ; CHECK-NEXT: revb z1.s, p0/m, z1.s
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -41,8 +41,8 @@ define void @test_revbv8i32(ptr %a) {
 define void @test_revbv4i64(ptr %a) {
 ; CHECK-LABEL: test_revbv4i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: revb z0.d, p0/m, z0.d
 ; CHECK-NEXT: revb z1.d, p0/m, z1.d
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -57,8 +57,8 @@ define void @test_revbv4i64(ptr %a) {
 define void @test_revhv8i32(ptr %a) {
 ; CHECK-LABEL: test_revhv8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: revh z0.s, p0/m, z0.s
 ; CHECK-NEXT: revh z1.s, p0/m, z1.s
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -73,8 +73,8 @@ define void @test_revhv8i32(ptr %a) {
 define void @test_revhv8f32(ptr %a) {
 ; CHECK-LABEL: test_revhv8f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: revh z0.s, p0/m, z0.s
 ; CHECK-NEXT: revh z1.s, p0/m, z1.s
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -89,8 +89,8 @@ define void @test_revhv8f32(ptr %a) {
 define void @test_revhv4i64(ptr %a) {
 ; CHECK-LABEL: test_revhv4i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: revh z0.d, p0/m, z0.d
 ; CHECK-NEXT: revh z1.d, p0/m, z1.d
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -105,8 +105,8 @@ define void @test_revhv4i64(ptr %a) {
 define void @test_revwv4i64(ptr %a) {
 ; CHECK-LABEL: test_revwv4i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: revw z0.d, p0/m, z0.d
 ; CHECK-NEXT: revw z1.d, p0/m, z1.d
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -121,8 +121,8 @@ define void @test_revwv4i64(ptr %a) {
 define void @test_revwv4f64(ptr %a) {
 ; CHECK-LABEL: test_revwv4f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: revw z0.d, p0/m, z0.d
 ; CHECK-NEXT: revw z1.d, p0/m, z1.d
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -150,8 +150,8 @@ define <16 x i8> @test_revv16i8(ptr %a) {
 define void @test_revwv8i32v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: test_revwv8i32v8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ldp q0, q1, [x1]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: revw z0.d, p0/m, z0.d
 ; CHECK-NEXT: revw z1.d, p0/m, z1.d
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -166,8 +166,8 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) {
 define void @test_revhv32i16(ptr %a) {
 ; CHECK-LABEL: test_revhv32i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ldp q0, q1, [x0, #32]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: revh z0.d, p0/m, z0.d
 ; CHECK-NEXT: revh z1.d, p0/m, z1.d
@@ -202,8 +202,8 @@ define void @test_rev_elts_fail(ptr %a) {
 define void @test_revdv4i64_sve2p1(ptr %a) #1 {
 ; CHECK-LABEL: test_revdv4i64_sve2p1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: revd z0.q, p0/m, z0.q
 ; CHECK-NEXT: revd z1.q, p0/m, z1.q
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -217,8 +217,8 @@ define void @test_revdv4i64_sve2p1(ptr %a) #1 {
 define void @test_revdv4f64_sve2p1(ptr %a) #1 {
 ; CHECK-LABEL: test_revdv4f64_sve2p1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: revd z0.q, p0/m, z0.q
 ; CHECK-NEXT: revd z1.q, p0/m, z1.q
 ; CHECK-NEXT: stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index d7bfb6b2680e13..cb73030306b023 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -82,11 +82,11 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: .cfi_def_cfa_offset 64
 ; CHECK-NEXT: ldp q1, q3, [x1]
 ; CHECK-NEXT: ldp q0, q4, [x0]
-; CHECK-NEXT: ldp q2, q6, [x0, #32]
+; CHECK-NEXT: ldp q2, q5, [x0, #32]
 ; CHECK-NEXT: mov z16.h, z3.h[7]
 ; CHECK-NEXT: mov z18.h, z3.h[6]
 ; CHECK-NEXT: mov z17.h, z4.h[7]
-; CHECK-NEXT: ldp q5, q7, [x1, #32]
+; CHECK-NEXT: ldp q6, q7, [x1, #32]
 ; CHECK-NEXT: mov z19.h, z4.h[6]
 ; CHECK-NEXT: fmov w8, s16
 ; CHECK-NEXT: mov z16.h, z3.h[5]
@@ -98,13 +98,13 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: mov z18.h, z3.h[4]
 ; CHECK-NEXT: strh w9, [sp, #28]
 ; CHECK-NEXT: fmov w9, s19
-; CHECK-NEXT: mov z19.h, z6.h[7]
+; CHECK-NEXT: mov z19.h, z5.h[7]
 ; CHECK-NEXT: zip1 z3.h, z4.h, z3.h
 ; CHECK-NEXT: strh w8, [sp, #26]
 ; CHECK-NEXT: fmov w8, s16
 ; CHECK-NEXT: mov z16.h, z4.h[4]
 ; CHECK-NEXT: strh w9, [sp, #24]
-; CHECK-NEXT: zip1 z4.h, z6.h, z7.h
+; CHECK-NEXT: zip1 z4.h, z5.h, z7.h
 ; CHECK-NEXT: strh w8, [sp, #22]
 ; CHECK-NEXT: fmov w8, s17
 ; CHECK-NEXT: mov z17.h, z1.h[7]
@@ -131,7 +131,7 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: fmov w8, s18
 ; CHECK-NEXT: mov z18.h, z0.h[4]
 ; CHECK-NEXT: zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT: zip1 z1.h, z2.h, z5.h
+; CHECK-NEXT: zip1 z1.h, z2.h, z6.h
 ; CHECK-NEXT: strh w8, [sp, #54]
 ; CHECK-NEXT: fmov w8, s16
 ; CHECK-NEXT: ldr q16, [sp, #16]
@@ -143,41 +143,41 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: mov z18.h, z7.h[7]
 ; CHECK-NEXT: strh w8, [sp, #48]
 ; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z6.h[6]
+; CHECK-NEXT: mov z18.h, z5.h[6]
 ; CHECK-NEXT: ldr q17, [sp, #48]
 ; CHECK-NEXT: strh w8, [sp, #46]
 ; CHECK-NEXT: fmov w8, s19
 ; CHECK-NEXT: mov z19.h, z7.h[5]
 ; CHECK-NEXT: strh w8, [sp, #44]
 ; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: mov z20.h, z6.h[5]
+; CHECK-NEXT: mov z20.h, z5.h[5]
 ; CHECK-NEXT: strh w8, [sp, #42]
 ; CHECK-NEXT: fmov w8, s18
 ; CHECK-NEXT: mov z18.h, z7.h[4]
 ; CHECK-NEXT: strh w8, [sp, #40]
 ; CHECK-NEXT: fmov w8, s19
-; CHECK-NEXT: mov z19.h, z6.h[4]
+; CHECK-NEXT: mov z19.h, z5.h[4]
 ; CHECK-NEXT: strh w8, [sp, #38]
 ; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: mov z20.h, z5.h[7]
+; CHECK-NEXT: mov z20.h, z6.h[7]
 ; CHECK-NEXT: strh w8, [sp, #36]
 ; CHECK-NEXT: fmov w8, s18
 ; CHECK-NEXT: mov z18.h, z2.h[7]
 ; CHECK-NEXT: strh w8, [sp, #34]
 ; CHECK-NEXT: fmov w8, s19
-; CHECK-NEXT: mov z19.h, z5.h[6]
+; CHECK-NEXT: mov z19.h, z6.h[6]
 ; CHECK-NEXT: strh w8, [sp, #32]
 ; CHECK-NEXT: fmov w8, s20
 ; CHECK-NEXT: mov z20.h, z2.h[6]
 ; CHECK-NEXT: strh w8, [sp, #14]
 ; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: mov z18.h, z5.h[5]
+; CHECK-NEXT: mov z18.h, z6.h[5]
 ; CHECK-NEXT: strh w8, [sp, #12]
 ; CHECK-NEXT: fmov w8, s19
 ; CHECK-NEXT: mov z19.h, z2.h[5]
 ; CHECK-NEXT: strh w8, [sp, #10]
 ; CHECK-NEXT: fmov w8, s20
-; CHECK-NEXT: mov z20.h, z5.h[4]
+; CHECK-NEXT: mov z20.h, z6.h[4]
 ; CHECK-NEXT: fmov w9, s19
 ; CHECK-NEXT: strh w8, [sp, #8]
 ; CHECK-NEXT: fmov w8, s18
@@ -186,10 +186,10 @@ define void @zip_v32i16(ptr %a, ptr %b) {
 ; CHECK-NEXT: ldr q2, [sp, #32]
 ; CHECK-NEXT: strh w8, [sp, #6]
 ; CHECK-NEXT: fmov w8, s20
+; CHECK-NEXT: fmov w9, s18
 ; CHECK-NEXT: add z2.h, z16.h, z2.h
 ; CHECK-NEXT: strh w8, [sp, #2]
-; CHECK-NEXT: fmov w8, s18
-; CHECK-NEXT: strh w8, [sp]
+; CHECK-NEXT: strh w9, [sp]
 ; CHECK-NEXT: ldr q4, [sp]
 ; CHECK-NEXT: stp q3, q2, [x0, #32]
 ; CHECK-NEXT: add z1.h, z17.h, z4.h
@@ -471,9 +471,9 @@ define void @trn_v4f64(ptr %a, ptr %b) {
 define void @trn_v4f32(ptr %a, ptr %b) {
 ; CHECK-LABEL: trn_v4f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldr q0, [x0]
 ; CHECK-NEXT: ldr q1, [x1]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: trn1 z2.s, z0.s, z1.s
 ; CHECK-NEXT: trn2 z0.s, z0.s, z1.s
 ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
index f2ba4a7cc35671..ab7c42b3e9e37d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
@@ -8,8 +8,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define i1 @ptest_v16i1(ptr %a, ptr %b) {
 ; CHECK-LABEL: ptest_v16i1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q1, q0, [x0, #32]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q2, q3, [x0]
 ; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0
@@ -20,7 +20,6 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) {
 ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: ptrue p1.b, vl16
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
@@ -31,7 +30,8 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) {
 ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b
 ; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b
-; CHECK-NEXT: umaxv b0, p1, z1.b
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: umaxv b0, p0, z1.b
 ; CHECK-NEXT: fmov w8, s0
 ; CHECK-NEXT: and w0, w8, #0x1
 ; CHECK-NEXT: ret
@@ -45,41 +45,41 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) {
 define i1 @ptest_or_v16i1(ptr %a, ptr %b) {
 ; CHECK-LABEL: ptest_or_v16i1:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q1, q0, [x0, #32]
 ; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: ldp q2, q3, [x0, #32]
-; CHECK-NEXT: ldp q4, q5, [x1]
-; CHECK-NEXT: ldp q6, q7, [x1, #32]
-; CHECK-NEXT: fcmne p1.s, p0/z, z3.s, #0.0
-; CHECK-NEXT: fcmne p2.s, p0/z, z2.s, #0.0
-; CHECK-NEXT: fcmne p3.s, p0/z, z1.s, #0.0
-; CHECK-NEXT: fcmne p4.s, p0/z, z0.s, #0.0
-; CHECK-NEXT: fcmne p5.s, p0/z, z7.s, #0.0
-; CHECK-NEXT: fcmne p6.s, p0/z, z6.s, #0.0
-; CHECK-NEXT: fcmne p7.s, p0/z, z5.s, #0.0
-; CHECK-NEXT: fcmne p0.s, p0/z, z4.s, #0.0
+; CHECK-NEXT: ldp q2, q3, [x0]
+; CHECK-NEXT: ldp q4, q5, [x1, #32]
+; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0
+; CHECK-NEXT: ldp q0, q1, [x1]
+; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0
+; CHECK-NEXT: fcmne p4.s, p0/z, z2.s, #0.0
+; CHECK-NEXT: fcmne p5.s, p0/z, z5.s, #0.0
+; CHECK-NEXT: fcmne p6.s, p0/z, z4.s, #0.0
+; CHECK-NEXT: fcmne p7.s, p0/z, z1.s, #0.0
+; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT: mov z6.s, p7/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.h, vl4
-; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT: ptrue p0.h, vl4
 ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
 ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h
+; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h
+; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h
 ; CHECK-NEXT: ptrue p0.b, vl8
-; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
-; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h
-; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h
-; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h
 ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b
 ; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b
@@ -112,41 +112,41 @@ declare i1 @llvm.vector.reduce.or.i1.v16i1(<16 x i1>)
 define i1 @ptest_and_v16i1(ptr %a, ptr %b) {
 ; CHECK-LABEL: ptest_and_v16i1:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ldp q1, q0, [x0, #32]
 ; CHECK-NEXT: ptrue p0.s, vl4
-; CHECK-NEXT: ldp q0, q1, [x0]
-; CHECK-NEXT: ldp q2, q3, [x0, #32]
-; CHECK-NEXT: ldp q4, q5, [x1]
-; CHECK-NEXT: ldp q6, q7, [x1, #32]
-; CHECK-NEXT: fcmne p1.s, p0/z, z3.s, #0.0
-; CHECK-NEXT: fcmne p2.s, p0/z, z2.s, #0.0
-; CHECK-NEXT: fcmne p3.s, p0/z, z1.s, #0.0
-; CHECK-NEXT: fcmne p4.s, p0/z, z0.s, #0.0
-; CHECK-NEXT: fcmne p5.s, p0/z, z7.s, #0.0
-; CHECK-NEXT: fcmne p6.s, p0/z, z6.s, #0.0
-; CHECK-NEXT: fcmne p7.s, p0/z, z5.s, #0.0
-; CHECK-NEXT: fcmne p0.s, p0/z, z4.s, #0.0
+; CHECK-NEXT: ldp q2, q3, [x0]
+; CHECK-NEXT: ldp q4, q5, [x1, #32]
+; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0
+; CHECK-NEXT: ldp q0, q1, [x1]
+; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0
+; CHECK-NEXT: fcmne p4.s, p0/z, z2.s, #0.0
+; CHECK-NEXT: fcmne p5.s, p0/z, z5.s, #0.0
+; CHECK-NEXT: fcmne p6.s, p0/z, z4.s, #0.0
+; CHECK-NEXT: fcmne p7.s, p0/z, z1.s, #0.0
+; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT: mov z6.s, p7/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p1.h, vl4
-; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
 ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT: ptrue p0.h, vl4
 ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
 ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h
+; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h
+; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h
+; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h
 ; CHECK-NEXT: ptrue p0.b, vl8
-; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h
-; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h
-; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h
-; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h
 ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b
 ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b
 ; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
index f686efff67b669..bfa931044bc531 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
@@ -49,8 +49,8 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) {
 define void @bitreverse_v32i8(ptr %a) {
 ; CHECK-LABEL: bitreverse_v32i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: rbit z0.b, p0/m, z0.b
 ; CHECK-NEXT: rbit z1.b, p0/m, z1.b
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -101,8 +101,8 @@ define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) {
 define void @bitreverse_v16i16(ptr %a) {
 ; CHECK-LABEL: bitreverse_v16i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h
 ; CHECK-NEXT: rbit z1.h, p0/m, z1.h
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -140,8 +140,8 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) {
 define void @bitreverse_v8i32(ptr %a) {
 ; CHECK-LABEL: bitreverse_v8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s
 ; CHECK-NEXT: rbit z1.s, p0/m, z1.s
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -179,8 +179,8 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) {
 define void @bitreverse_v4i64(ptr %a) {
 ; CHECK-LABEL: bitreverse_v4i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: rbit z0.d, p0/m, z0.d
 ; CHECK-NEXT: rbit z1.d, p0/m, z1.d
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -235,8 +235,8 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) {
 define void @bswap_v16i16(ptr %a) {
 ; CHECK-LABEL: bswap_v16i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: revb z0.h, p0/m, z0.h
 ; CHECK-NEXT: revb z1.h, p0/m, z1.h
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -274,8 +274,8 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) {
 define void @bswap_v8i32(ptr %a) {
 ; CHECK-LABEL: bswap_v8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: revb z0.s, p0/m, z0.s
 ; CHECK-NEXT: revb z1.s, p0/m, z1.s
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -313,8 +313,8 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) {
 define void @bswap_v4i64(ptr %a) {
 ; CHECK-LABEL: bswap_v4i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: revb z0.d, p0/m, z0.d
 ; CHECK-NEXT: revb z1.d, p0/m, z1.d
 ; CHECK-NEXT: stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
index 76bb465774d5b2..9dd42e7831e0d0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
@@ -45,8 +45,8 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) {
 define void @sdiv_v32i8(ptr %a) {
 ; CHECK-LABEL: sdiv_v32i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5
 ; CHECK-NEXT: asrd z1.b, p0/m, z1.b, #5
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -97,8 +97,8 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) {
 define void @sdiv_v16i16(ptr %a) {
 ; CHECK-LABEL: sdiv_v16i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.h, vl8
 ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5
 ; CHECK-NEXT: asrd z1.h, p0/m, z1.h, #5
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -136,8 +136,8 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) {
 define void @sdiv_v8i32(ptr %a) {
 ; CHECK-LABEL: sdiv_v8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5
 ; CHECK-NEXT: asrd z1.s, p0/m, z1.s, #5
 ; CHECK-NEXT: stp q0, q1, [x0]
@@ -176,8 +176,8 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) {
 define void @sdiv_v4i64(ptr %a) {
 ; CHECK-LABEL: sdiv_v4i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: ldp q0, q1, [x0]
+; CHECK-NEXT: ptrue p0.d, vl2
 ; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5
 ; CHECK-NEXT: asrd z1.d, p0/m, z1.d, #5
 ; CHECK-NEXT: stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
index ff1f8699b91afd..ad0d4ef0afef36 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
@@ -37,9 +37,9 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2
 define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2) {
 ; CHECK-LABEL: interleave_store_legalization:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: mov z5.d, z2.d
 ; CHECK-NEXT: // kill: def $q3 killed $q3 def $z2_z3
+; CHECK-NEXT: ptrue p0.s, vl4
 ; CHECK-NEXT: mov x8, #8 // =0x8
 ; CHECK-NEXT: mov z4.d, z0.d
 ; CHECK-NEXT: mov z2.d, z1.d
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
index 367ccbeeea81ed..06709ca3685c8e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
@@ -8,8 +8,8 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @store_v4i8(ptr %a) {
 ; CHECK-LABEL: store_v4i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl4
 ; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: ptrue p0.h, vl4
 ; CHECK-NEXT: st1b { z0.h }, p0, [x0]
 ; CHECK-NEXT: ret
 store <4 x i8> zeroinitializer, ptr %a
@@ -49,8 +49,8 @@ define void @store_v32i8(ptr %a) {
 define void @store_v2i16(ptr %a) {
 ; CHECK-LABEL: store_v2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl2
 ; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: ptrue p0.s, vl2
 ; CHECK-NEXT: st1h { z0.s }, p0, [x0]
 ; CHECK-NEXT: ret
 store <2 x i16> zeroinitializer, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
index 4fef6783140193..70219dd30f7699 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
@@ -87,51 +87,51 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind {
 define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind {
 ; CHECK-LABEL: trunc_v128i16_v128i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0, #192]
+; CHECK-NEXT: ldp q2, q3, [x0, #192]
 ; CHECK-NEXT: ptrue p0.b, vl8
-; CHECK-NEXT: ldp q6, q7, [x0, #224]
-; CHECK-NEXT: ldp q2, q3, [x0, #32]
-; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b
-; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT: uzp1 z7.b, z7.b, z7.b
-; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b
-; CHECK-NEXT: ldp q4, q5, [x0]
+; CHECK-NEXT: ldp q6, q7, [x0, #64]
+; CHECK-NEXT: ldp q16, q17, [x0, #224]
 ; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b
-; CHECK-NEXT: ldp q16, q17, [x0, #64]
 ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
-; CHECK-NEXT: ldp q18, q19, [x0, #128]
-; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT: ldp q20, q21, [x0, #160]
-; CHECK-NEXT: splice z6.b, p0, z6.b, z7.b
-; CHECK-NEXT: ldp q22, q23, [x0, #96]
-; CHECK-NEXT: uzp1 z1.b, z17.b, z17.b
-; CHECK-NEXT: uzp1 z19.b, z19.b, z19.b
-; CHECK-NEXT: uzp1 z18.b, z18.b, z18.b
+; CHECK-NEXT: uzp1 z7.b, z7.b, z7.b
+; CHECK-NEXT: ldp q0, q1, [x0, #32]
+; CHECK-NEXT: uzp1 z17.b, z17.b, z17.b
+; CHECK-NEXT: ldp q4, q5, [x0, #96]
 ; CHECK-NEXT: uzp1 z16.b, z16.b, z16.b
-; CHECK-NEXT: uzp1 z21.b, z21.b, z21.b
+; CHECK-NEXT: ldp q18, q19, [x0, #128]
+; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b
+; CHECK-NEXT: uzp1 z3.b, z21.b, z21.b
 ; CHECK-NEXT: uzp1 z20.b, z20.b, z20.b
-; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b
-; CHECK-NEXT: uzp1 z7.b, z23.b, z23.b
-; CHECK-NEXT: uzp1 z17.b, z22.b, z22.b
+; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b
+; CHECK-NEXT: ldp q21, q22, [x0]
+; CHECK-NEXT: splice z16.b, p0, z16.b, z17.b
+; CHECK-NEXT: uzp1 z19.b, z19.b, z19.b
+; CHECK-NEXT: uzp1 z18.b, z18.b, z18.b
 ; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b
-; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b
-; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: splice z20.b, p0, z20.b, z3.b
+; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b
+; CHECK-NEXT: splice z6.b, p0, z6.b, z7.b
+; CHECK-NEXT: uzp1 z5.b, z22.b, z22.b
+; CHECK-NEXT: uzp1 z7.b, z21.b, z21.b
+; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT: splice z18.b, p0, z18.b, z19.b
-; CHECK-NEXT: splice z16.b, p0, z16.b, z1.b
-; CHECK-NEXT: add z1.b, z6.b, z6.b
-; CHECK-NEXT: splice z20.b, p0, z20.b, z21.b
-; CHECK-NEXT: splice z17.b, p0, z17.b, z7.b
-; CHECK-NEXT: splice z4.b, p0, z4.b, z5.b
-; CHECK-NEXT: stp q0, q1, [x1, #96]
 ; CHECK-NEXT: add z2.b, z2.b, z2.b
+; CHECK-NEXT: splice z4.b, p0, z4.b, z3.b
+; CHECK-NEXT: add z3.b, z16.b, z16.b
+; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b
+; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
+; CHECK-NEXT: add z1.b, z20.b, z20.b
 ; CHECK-NEXT: add z5.b, z18.b, z18.b
-; CHECK-NEXT: add z0.b, z16.b, z16.b
-; CHECK-NEXT: add z3.b, z20.b, z20.b
-; CHECK-NEXT: add z1.b, z17.b, z17.b
-; CHECK-NEXT: add z4.b, z4.b, z4.b
-; CHECK-NEXT: stp q5, q3, [x1, #64]
-; CHECK-NEXT: stp q4, q2, [x1]
-; CHECK-NEXT: stp q0, q1, [x1, #32]
+; CHECK-NEXT: stp q2, q3, [x1, #96]
+; CHECK-NEXT: add z2.b, z6.b, z6.b
+; CHECK-NEXT: add z3.b, z4.b, z4.b
+; CHECK-NEXT: add z4.b, z7.b, z7.b
+; CHECK-NEXT: add z0.b, z0.b, z0.b
+; CHECK-NEXT: stp q5, q1, [x1, #64]
+; CHECK-NEXT: stp q2, q3, [x1, #32]
+; CHECK-NEXT: stp q4, q0, [x1]
 ; CHECK-NEXT: ret
 %a = load <128 x i16>, ptr %in
 %b = trunc <128 x i16> %a to <128 x i8>
@@ -226,55 +226,55 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind {
 define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind {
 ; CHECK-LABEL: trunc_v64i32_v64i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0, #64]
-; CHECK-NEXT: ptrue p0.h, vl4
 ; CHECK-NEXT: ldp q2, q3, [x0, #160]
-; CHECK-NEXT: ptrue p1.b, vl8
-; CHECK-NEXT: ldp q4, q5, [x0, #96]
-; CHECK-NEXT: ldp q6, q7, [x0]
-; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT: ldp q16, q17, [x0, #128]
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: ldp q4, q5, [x0, #128]
+; CHECK-NEXT: ldp q0, q1, [x0, #64]
+; CHECK-NEXT: ldp q6, q7, [x0, #96]
 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT: ldp q18, q19, [x0, #192]
 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT: ldp q20, q21, [x0, #224]
+; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
+; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: ldp q16, q17, [x0]
+; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT: ldp q18, q19, [x0, #192]
 ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT: ldp q20, q21, [x0, #224]
+; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h
 ; CHECK-NEXT: ldp q22, q23, [x0, #32]
-; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h
-; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h
+; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h
 ; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h
 ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h
-; CHECK-NEXT: uzp1 z21.h, z21.h, z21.h
-; CHECK-NEXT: uzp1 z20.h, z20.h, z20.h
+; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h
+; CHECK-NEXT: uzp1 z3.h, z21.h, z21.h
+; CHECK-NEXT: uzp1 z5.h, z20.h, z20.h
+; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h
+; CHECK-NEXT: uzp1 z20.h, z23.h, z23.h
+; CHECK-NEXT: uzp1 z21.h, z22.h, z22.h
 ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT: uzp1 z23.h, z23.h, z23.h
-; CHECK-NEXT: uzp1 z22.h, z22.h, z22.h
-; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
-; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h
-; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h
 ; CHECK-NEXT: splice z18.h, p0, z18.h, z19.h
-; CHECK-NEXT: splice z20.h, p0, z20.h, z21.h
+; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h
+; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h
+; CHECK-NEXT: splice z21.h, p0, z21.h, z20.h
 ; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h
-; CHECK-NEXT: splice z22.h, p0, z22.h, z23.h
-; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h
 ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b
-; CHECK-NEXT: uzp1 z2.b, z16.b, z16.b
-; CHECK-NEXT: uzp1 z5.b, z18.b, z18.b
-; CHECK-NEXT: uzp1 z3.b, z20.b, z20.b
-; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b
-; CHECK-NEXT: uzp1 z7.b, z22.b, z22.b
-; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b
+; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: uzp1 z4.b, z18.b, z18.b
+; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b
+; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b
+; CHECK-NEXT: uzp1 z5.b, z21.b, z21.b
+; CHECK-NEXT: splice z2.b, p0, z2.b, z1.b
+; CHECK-NEXT: uzp1 z1.b, z6.b, z6.b
 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT: splice z2.b, p1, z2.b, z1.b
-; CHECK-NEXT: splice z5.b, p1, z5.b, z3.b
-; CHECK-NEXT: splice z6.b, p1, z6.b, z7.b
-; CHECK-NEXT: splice z0.b, p1, z0.b, z4.b
+; CHECK-NEXT: splice z4.b, p0, z4.b, z3.b
+; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b
+; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b
 ; CHECK-NEXT: add z1.b, z2.b, z2.b
-; CHECK-NEXT: add z2.b, z5.b, z5.b
-; CHECK-NEXT: add z3.b, z6.b, z6.b
+; CHECK-NEXT: add z2.b, z4.b, z4.b
+; CHECK-NEXT: add z3.b, z7.b, z7.b
 ; CHECK-NEXT: add z0.b, z0.b, z0.b
 ; CHECK-NEXT: stp q1, q2, [x1, #32]
 ; CHECK-NEXT: stp q3, q0, [x1]
@@ -368,51 +368,51 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind {
 define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind {
 ; CHECK-LABEL: trunc_v64i32_v64i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0, #192]
+; CHECK-NEXT: ldp q2, q3, [x0, #192]
 ; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: ldp q6, q7, [x0, #224]
-; CHECK-NEXT: ldp q2, q3, [x0, #32]
-; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
-; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h
-; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT: ldp q4, q5, [x0]
+; CHECK-NEXT: ldp q6, q7, [x0, #64]
+; CHECK-NEXT: ldp q16, q17, [x0, #224]
 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h
-; CHECK-NEXT: ldp q16, q17, [x0, #64]
 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h
-; CHECK-NEXT: ldp q18, q19, [x0, #128]
-; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT: ldp q20, q21, [x0, #160]
-; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h
-; CHECK-NEXT: ldp q22, q23, [x0, #96]
-; CHECK-NEXT: uzp1 z1.h, z17.h, z17.h
-; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h
-; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h
+; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h
+; CHECK-NEXT: ldp q0, q1, [x0, #32]
+; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h
+; CHECK-NEXT: ldp q4, q5, [x0, #96]
 ; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h
-; CHECK-NEXT: uzp1 z21.h, z21.h, z21.h
+; CHECK-NEXT: ldp q18, q19, [x0, #128]
+; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h
+; CHECK-NEXT: uzp1 z3.h, z21.h, z21.h
 ; CHECK-NEXT: uzp1 z20.h, z20.h, z20.h
-; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h
-; CHECK-NEXT: uzp1 z7.h, z23.h, z23.h
-; CHECK-NEXT: uzp1 z17.h, z22.h, z22.h
+; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
+; CHECK-NEXT: ldp q21, q22, [x0]
+; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h
+; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h
+; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h
 ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
-; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h
-; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: splice z20.h, p0, z20.h, z3.h
+; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h
+; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h
+; CHECK-NEXT: uzp1 z5.h, z22.h, z22.h
+; CHECK-NEXT: uzp1 z7.h, z21.h, z21.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT: splice z18.h, p0, z18.h, z19.h
-; CHECK-NEXT: splice z16.h, p0, z16.h, z1.h
-; CHECK-NEXT: add z1.h, z6.h, z6.h
-; CHECK-NEXT: splice z20.h, p0, z20.h, z21.h
-; CHECK-NEXT: splice z17.h, p0, z17.h, z7.h
-; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h
-; CHECK-NEXT: stp q0, q1, [x1, #96]
 ; CHECK-NEXT: add z2.h, z2.h, z2.h
+; CHECK-NEXT: splice z4.h, p0, z4.h, z3.h
+; CHECK-NEXT: add z3.h, z16.h, z16.h
+; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h
+; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: add z1.h, z20.h, z20.h
 ; CHECK-NEXT: add z5.h, z18.h, z18.h
-; CHECK-NEXT: add z0.h, z16.h, z16.h
-; CHECK-NEXT: add z3.h, z20.h, z20.h
-; CHECK-NEXT: add z1.h, z17.h, z17.h
-; CHECK-NEXT: add z4.h, z4.h, z4.h
-; CHECK-NEXT: stp q5, q3, [x1, #64]
-; CHECK-NEXT: stp q4, q2, [x1]
-; CHECK-NEXT: stp q0, q1, [x1, #32]
+; CHECK-NEXT: stp q2, q3, [x1, #96]
+; CHECK-NEXT: add z2.h, z6.h, z6.h
+; CHECK-NEXT: add z3.h, z4.h, z4.h
+; CHECK-NEXT: add z4.h, z7.h, z7.h
+; CHECK-NEXT: add z0.h, z0.h, z0.h
+; CHECK-NEXT: stp q5, q1, [x1, #64]
+; CHECK-NEXT: stp q2, q3, [x1, #32]
+; CHECK-NEXT: stp q4, q0, [x1]
 ; CHECK-NEXT: ret
 %a = load <64 x i32>, ptr %in
 %b = trunc <64 x i32> %a to <64 x i16>
@@ -535,19 +535,19 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind {
 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s
-; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s
 ; CHECK-NEXT: splice z20.s, p0, z20.s, z21.s
+; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s
-; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s
 ; CHECK-NEXT: splice z22.s, p0, z22.s, z23.s
+; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s
 ; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s
 ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT: ptrue p0.h, vl4
 ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
 ; CHECK-NEXT: uzp1 z2.h, z16.h, z16.h
-; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h
 ; CHECK-NEXT: uzp1 z3.h, z20.h, z20.h
+; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h
-; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
 ; CHECK-NEXT: uzp1 z7.h, z22.h, z22.h
+; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
 ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT: splice z2.h, p0, z2.h, z1.h
@@ -658,55 +658,55 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind {
 define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind {
 ; CHECK-LABEL: trunc_v32i64_v32i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0, #64]
-; CHECK-NEXT: ptrue p0.s, vl2
 ; CHECK-NEXT: ldp q2, q3, [x0, #160]
-; CHECK-NEXT: ptrue p1.h, vl4
-; CHECK-NEXT: ldp q4, q5, [x0, #96]
-; CHECK-NEXT: ldp q6, q7, [x0]
-; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT: ldp q16, q17, [x0, #128]
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: ldp q4, q5, [x0, #128]
+; CHECK-NEXT: ldp q0, q1, [x0, #64]
+; CHECK-NEXT: ldp q6, q7, [x0, #96]
 ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT: ldp q18, q19, [x0, #192]
 ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT: ldp q20, q21, [x0, #224]
+; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s
+; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s
+; CHECK-NEXT: ldp q16, q17, [x0]
+; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: ldp q18, q19, [x0, #192]
 ; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s
+; CHECK-NEXT: ldp q20, q21, [x0, #224]
+; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s
 ; CHECK-NEXT: ldp q22, q23, [x0, #32]
-; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s
-; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s
+; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s
 ; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s
 ; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s
-; CHECK-NEXT: uzp1 z21.s, z21.s, z21.s
-; CHECK-NEXT: uzp1 z20.s, z20.s, z20.s
+; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s
+; CHECK-NEXT: uzp1 z3.s, z21.s, z21.s
+; CHECK-NEXT: uzp1 z5.s, z20.s, z20.s
+; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s
+; CHECK-NEXT: uzp1 z20.s, z23.s, z23.s
+; CHECK-NEXT: uzp1 z21.s, z22.s, z22.s
 ; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s
-; CHECK-NEXT: uzp1 z23.s, z23.s, z23.s
-; CHECK-NEXT: uzp1 z22.s, z22.s, z22.s
-; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s
 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s
-; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s
 ; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s
-; CHECK-NEXT: splice z20.s, p0, z20.s, z21.s
+; CHECK-NEXT: splice z5.s, p0, z5.s, z3.s
+; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s
+; CHECK-NEXT: splice z21.s, p0, z21.s, z20.s
 ; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s
-; CHECK-NEXT: splice z22.s, p0, z22.s, z23.s
-; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s
 ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h
-; CHECK-NEXT: uzp1 z2.h, z16.h, z16.h
-; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h
-; CHECK-NEXT: uzp1 z3.h, z20.h, z20.h
-; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h
-; CHECK-NEXT: uzp1 z7.h, z22.h, z22.h
-; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h
+; CHECK-NEXT: uzp1 z2.h, z4.h, z4.h
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: uzp1 z4.h, z18.h, z18.h
+; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h
+; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h
+; CHECK-NEXT: uzp1 z5.h, z21.h, z21.h
+; CHECK-NEXT: splice z2.h, p0, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.h, z6.h, z6.h
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT: splice z2.h, p1, z2.h, z1.h
-; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h
-; CHECK-NEXT: splice z6.h, p1, z6.h, z7.h
-; CHECK-NEXT: splice z0.h, p1, z0.h, z4.h
+; CHECK-NEXT: splice z4.h, p0, z4.h, z3.h
+; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h
+; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
 ; CHECK-NEXT: add z1.h, z2.h, z2.h
-; CHECK-NEXT: add z2.h, z5.h, z5.h
-; CHECK-NEXT: add z3.h, z6.h, z6.h
+; CHECK-NEXT: add z2.h, z4.h, z4.h
+; CHECK-NEXT: add z3.h, z7.h, z7.h
 ; CHECK-NEXT: add z0.h, z0.h, z0.h
 ; CHECK-NEXT: stp q1, q2, [x1, #32]
 ; CHECK-NEXT: stp q3, q0, [x1]
@@ -800,51 +800,51 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind {
 define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind {
 ; CHECK-LABEL: trunc_v32i64_v32i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q1, [x0, #192]
+; CHECK-NEXT: ldp q2, q3, [x0, #192]
 ; CHECK-NEXT: ptrue p0.s, vl2
-; CHECK-NEXT: ldp q6, q7, [x0, #224]
-; CHECK-NEXT: ldp q2, q3, [x0, #32]
-; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
-; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
-; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s
-; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s
-; CHECK-NEXT: ldp q4, q5, [x0]
+; CHECK-NEXT: ldp q6, q7, [x0, #64]
+; CHECK-NEXT: ldp q16, q17, [x0, #224]
 ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s
-; CHECK-NEXT: ldp q16, q17, [x0, #64]
 ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s
-; CHECK-NEXT: ldp q18, q19, [x0, #128]
-; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
 ; CHECK-NEXT: ldp q20, q21, [x0, #160]
-; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s
-; CHECK-NEXT: ldp q22, q23, [x0, #96]
-; CHECK-NEXT: uzp1 z1.s, z17.s, z17.s
-; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s
-; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s
+; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s
+; CHECK-NEXT: ldp q0, q1, [x0, #32]
+; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s
+; CHECK-NEXT: ldp q4, q5, [x0, #96]
 ; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s
-; CHECK-NEXT: uzp1 z21.s, z21.s, z21.s
+; CHECK-NEXT: ldp q18, q19, [x0, #128]
+; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s
+; CHECK-NEXT: uzp1 z3.s, z21.s, z21.s
 ; CHECK-NEXT: uzp1 z20.s, z20.s, z20.s
-; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s
-; CHECK-NEXT: uzp1 z7.s, z23.s, z23.s
-; CHECK-NEXT: uzp1 z17.s, z22.s, z22.s
+; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s
+; CHECK-NEXT: ldp q21, q22, [x0]
+; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s
+; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s
+; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s
 ; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s
-; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s
-; CHECK-NEXT: add z0.s, z0.s, z0.s
+; CHECK-NEXT: splice z20.s, p0, z20.s, z3.s
+; CHECK-NEXT: uzp1 z3.s, z5.s, z5.s
+; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s
+; CHECK-NEXT: uzp1 z5.s, z22.s, z22.s
+; CHECK-NEXT: uzp1 z7.s, z21.s, z21.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
 ; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s
-; CHECK-NEXT: splice z16.s, p0, z16.s, z1.s
-; CHECK-NEXT: add z1.s, z6.s, z6.s
-; CHECK-NEXT: splice z20.s, p0, z20.s, z21.s
-; CHECK-NEXT: splice z17.s, p0, z17.s, z7.s
-; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s
-; CHECK-NEXT: stp q0, q1, [x1, #96]
 ; CHECK-NEXT: add z2.s, z2.s, z2.s
+; CHECK-NEXT: splice z4.s, p0, z4.s, z3.s
+; CHECK-NEXT: add z3.s, z16.s, z16.s
+; CHECK-NEXT: splice z7.s, p0, z7.s, z5.s
+; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: add z1.s, z20.s, z20.s
 ; CHECK-NEXT: add z5.s, z18.s, z18.s
-; CHECK-NEXT: add z0.s, z16.s, z16.s
-; CHECK-NEXT: add z3.s, z20.s, z20.s
-; CHECK-NEXT: add z1.s, z17.s, z17.s
-; CHECK-NEXT: add z4.s, z4.s, z4.s
-; CHECK-NEXT: stp q5, q3, [x1, #64]
-; CHECK-NEXT: stp q4, q2, [x1]
-; CHECK-NEXT: stp q0, q1, [x1, #32]
+; CHECK-NEXT: stp q2, q3, [x1, #96]
+; CHECK-NEXT: add z2.s, z6.s, z6.s
+; CHECK-NEXT: add z3.s, z4.s, z4.s
+; CHECK-NEXT: add z4.s, z7.s, z7.s
+; CHECK-NEXT: add z0.s, z0.s, z0.s
+; CHECK-NEXT: stp q5, q1, [x1, #64]
+; CHECK-NEXT: stp q2, q3, [x1, #32]
+; CHECK-NEXT: stp q4, q0, [x1]
 ; CHECK-NEXT: ret
 %a = load <32 x i64>, ptr %in
 %b = trunc <32 x i64> %a to <32 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll
index 92dfc739613627..0ec6538947c73f 100644
--- a/llvm/test/CodeGen/AArch64/sve-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll
@@ -61,8 +61,8 @@ entry:
 define @trunc_i64toi1( %in) {
 ; CHECK-LABEL: trunc_i64toi1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z0.d, z0.d, #0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT: ret
 entry:
@@ -73,9 +73,9 @@ entry:
 define @trunc_i64toi1_split( %in) {
 ; CHECK-LABEL: trunc_i64toi1_split:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z1.d, z1.d, #0x1
 ; CHECK-NEXT: and z0.d, z0.d, #0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0
 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
 ; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s
@@ -88,11 +88,11 @@ entry:
 define @trunc_i64toi1_split2( %in) {
 ; CHECK-LABEL: trunc_i64toi1_split2:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z3.d, z3.d, #0x1
 ; CHECK-NEXT: and z2.d, z2.d, #0x1
 ; CHECK-NEXT: and z1.d, z1.d, #0x1
 ; CHECK-NEXT: and z0.d, z0.d, #0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: cmpne p1.d, p0/z, z3.d, #0
 ; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0
 ; CHECK-NEXT: cmpne p3.d, p0/z, z1.d, #0
@@ -111,12 +111,12 @@ define @trunc_i64toi1_split3( %in) {
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z7.d, z7.d, #0x1
 ; CHECK-NEXT: and z6.d, z6.d, #0x1
 ; CHECK-NEXT: and z5.d, z5.d, #0x1
@@ -125,23 +125,25 @@ define @trunc_i64toi1_split3( %in) {
 ; CHECK-NEXT: and z2.d, z2.d, #0x1
 ; CHECK-NEXT: and z1.d, z1.d, #0x1
 ; CHECK-NEXT: and z0.d, z0.d, #0x1
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: cmpne p1.d, p0/z, z7.d, #0
 ; CHECK-NEXT: cmpne p2.d, p0/z, z6.d, #0
 ; CHECK-NEXT: cmpne p3.d, p0/z, z5.d, #0
 ; CHECK-NEXT: cmpne p4.d, p0/z, z4.d, #0
 ; CHECK-NEXT: cmpne p5.d, p0/z, z3.d, #0
 ; CHECK-NEXT: cmpne p6.d, p0/z, z2.d, #0
-; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s
-; CHECK-NEXT: cmpne p2.d, p0/z, z1.d, #0
+; CHECK-NEXT: cmpne p7.d, p0/z, z1.d, #0
 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
-; CHECK-NEXT: uzp1 p3.s, p4.s, p3.s
-; CHECK-NEXT: uzp1 p4.s, p6.s, p5.s
+; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s
+; CHECK-NEXT: uzp1 p2.s, p4.s, p3.s
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: uzp1 p3.s, p6.s, p5.s
 ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p0.s, p0.s, p2.s
+; CHECK-NEXT: uzp1 p0.s, p0.s, p7.s
+; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h
 ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p1.h, p3.h, p1.h
-; CHECK-NEXT: uzp1 p0.h, p0.h, p4.h
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: uzp1 p0.h, p0.h, p3.h
 ; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -155,8 +157,8 @@ entry:
 define @trunc_i32toi1( %in) {
 ; CHECK-LABEL: trunc_i32toi1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: and z0.s, z0.s, #0x1
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT: ret
 entry:
@@ -167,8 +169,8 @@ entry:
 define @trunc_i16toi1( %in) {
 ; CHECK-LABEL: trunc_i16toi1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: and z0.h, z0.h, #0x1
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
 ; CHECK-NEXT: ret
 entry:
@@ -179,8 +181,8 @@ entry:
 define @trunc_i8toi1( %in) {
 ; CHECK-LABEL: trunc_i8toi1:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: and z0.b, z0.b, #0x1
+; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
 ; CHECK-NEXT: ret
 entry:
@@ -191,8 +193,8 @@ entry:
 define @trunc_nxv1i32_to_nxv1i1( %in) {
 ; CHECK-LABEL: trunc_nxv1i32_to_nxv1i1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: and z0.s, z0.s, #0x1
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
 ; CHECK-NEXT: punpklo p0.h, p0.b
 ; CHECK-NEXT: punpklo p0.h, p0.b
@@ -204,8 +206,8 @@ define @trunc_nxv1i32_to_nxv1i1( %in) {
 define void @trunc_promoteIntRes( %0, ptr %ptr) {
 ; CHECK-LABEL: trunc_promoteIntRes:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: st1h { z0.s }, p0, [x0]
 ; CHECK-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
index 36d64725742e57..818f37c85ffdb9 100644
--- a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
+++ b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll
@@ -6,9 +6,9 @@ declare { , } @llvm.umul.with.overflow.nxv2i8
 define @umulo_nxv2i8( %x, %y) {
 ; CHECK-LABEL: umulo_nxv2i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z1.d, z1.d, #0xff
 ; CHECK-NEXT: and z0.d, z0.d, #0xff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: lsr z1.d, z0.d, #8
 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
@@ -26,9 +26,9 @@ declare { , } @llvm.umul.with.overflow.nxv4i8
 define @umulo_nxv4i8( %x, %y) {
 ; CHECK-LABEL: umulo_nxv4i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: and z1.s, z1.s, #0xff
 ; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: lsr z1.s, z0.s, #8
 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
@@ -46,9 +46,9 @@ declare { , } @llvm.umul.with.overflow.nxv8i8
 define @umulo_nxv8i8( %x, %y) {
 ; CHECK-LABEL: umulo_nxv8i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: and z1.h, z1.h, #0xff
 ; CHECK-NEXT: and z0.h, z0.h, #0xff
+; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: lsr z1.h, z0.h, #8
 ; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0
@@ -119,9 +119,9 @@ define @umulo_nxv64i8( %x, , } @llvm.umul.with.overflow.nxv2i
 define @umulo_nxv2i16( %x, %y) {
 ; CHECK-LABEL: umulo_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z1.d, z1.d, #0xffff
 ; CHECK-NEXT: and z0.d, z0.d, #0xffff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: lsr z1.d, z0.d, #16
 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
@@ -163,9 +163,9 @@ declare { , } @llvm.umul.with.overflow.nxv4i
 define @umulo_nxv4i16( %x, %y) {
 ; CHECK-LABEL: umulo_nxv4i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: and z1.s, z1.s, #0xffff
 ; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT: lsr z1.s, z0.s, #16
 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
@@ -236,9 +236,9 @@ define @umulo_nxv32i16( %x, , } @llvm.umul.with.overflow.nxv2i
 define @umulo_nxv2i32( %x, %y) {
 ; CHECK-LABEL: umulo_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
 ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT: lsr z1.d, z0.d, #32
 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
@@ -333,9 +333,9 @@ define @umulo_nxv16i32( %x, @umulo_nxv8i64( %x, %a, ptr %b) #0 {
 define void @uzp1_i8_invalid( %a, ptr %b) #0 {
 ; CHECK-LABEL: uzp1_i8_invalid:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b, vl128
 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: ptrue p0.b, vl128
 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
 ; CHECK-NEXT: ret
 %a.bc = bitcast %a to
@@ -141,8 +141,8 @@ define void @uzp1_i16_valid( %a, ptr %b) #0 {
 define void @uzp1_i16_invalid( %a, ptr %b) #0 {
 ; CHECK-LABEL: uzp1_i16_invalid:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h, vl64
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: ptrue p0.h, vl64
 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
 ; CHECK-NEXT: ret
 %a.bc = bitcast %a to
@@ -168,8 +168,8 @@ define void @uzp1_i32_valid( %a, ptr %b) #0 {
 define void @uzp1_i32_invalid( %a, ptr %b) #0 {
 ; CHECK-LABEL: uzp1_i32_invalid:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl32
 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: ptrue p0.s, vl32
 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT: ret
 %a.bc = bitcast %a to
@@ -182,8 +182,8 @@ define void @uzp1_i32_invalid( %a, ptr %b) #0 {
 define void @uzp1_invalid_all( %a, ptr %b) #0 {
 ; CHECK-LABEL: uzp1_invalid_all:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT: ret
 %a.bc = bitcast %a to
diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
index 194d1071301d4d..91f8f5c2c90d84 100644
--- a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll
@@ -8,11 +8,11 @@ define i32 @test( %bin.rdx, %bin.rdx2) {
 ; CHECK-NEXT: sunpklo z5.h, z0.b
 ; CHECK-NEXT: sunpkhi z0.h, z0.b
 ; CHECK-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: sunpklo z6.h, z1.b
 ; CHECK-NEXT: sunpkhi z1.h, z1.b
 ; CHECK-NEXT: sunpklo z7.h, z3.b
 ; CHECK-NEXT: sunpkhi z3.h, z3.b
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: sunpkhi z24.s, z5.h
 ; CHECK-NEXT: sunpklo z5.s, z5.h
 ; CHECK-NEXT: sunpklo z25.s, z4.h
diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll
index 898090340869ee..0bdaefdfc2a3f0 100644
--- a/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll
@@ -80,8 +80,8 @@ define i1 @reduce_and_insert_subvec_into_var( %in, @test_copysign_v4f32_v4f64( %a,
 ; CHECK_NO_EXTEND_ROUND-LABEL: test_copysign_v4f32_v4f64:
 ; CHECK_NO_EXTEND_ROUND: // %bb.0:
 ; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d
+; CHECK_NO_EXTEND_ROUND-NEXT: mov z3.s, #0x7fffffff
 ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z2.s, p0/m, z2.d
 ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z1.s, p0/m, z1.d
 ; CHECK_NO_EXTEND_ROUND-NEXT: uzp1 z1.s, z1.s, z2.s
-; CHECK_NO_EXTEND_ROUND-NEXT: mov z2.s, #0x7fffffff
-; CHECK_NO_EXTEND_ROUND-NEXT: bsl z0.d, z0.d, z1.d, z2.d
+; CHECK_NO_EXTEND_ROUND-NEXT: bsl z0.d, z0.d, z1.d, z3.d
 ; CHECK_NO_EXTEND_ROUND-NEXT: ret
 ;
 ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f32_v4f64:
@@ -107,9 +107,9 @@ declare @llvm.copysign.v2f64( %a,
 define @test_copysign_v4f64_v4f32( %a, %b) #0 {
 ; CHECK_NO_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32:
 ; CHECK_NO_EXTEND_ROUND: // %bb.0:
-; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d
 ; CHECK_NO_EXTEND_ROUND-NEXT: uunpkhi z3.d, z2.s
 ; CHECK_NO_EXTEND_ROUND-NEXT: uunpklo z2.d, z2.s
+; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d
 ; CHECK_NO_EXTEND_ROUND-NEXT: mov z4.d, #0x7fffffffffffffff
 ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z3.d, p0/m, z3.s
 ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z2.d, p0/m, z2.s
@@ -119,9 +119,9 @@ define @test_copysign_v4f64_v4f32( %a
 ;
 ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32:
 ; CHECK_EXTEND_ROUND: // %bb.0:
-; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d
 ; CHECK_EXTEND_ROUND-NEXT: uunpkhi z3.d, z2.s
 ; CHECK_EXTEND_ROUND-NEXT: uunpklo z2.d, z2.s
+; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d
 ; CHECK_EXTEND_ROUND-NEXT: mov z4.d, #0x7fffffffffffffff
 ; CHECK_EXTEND_ROUND-NEXT: fcvt z2.d, p0/m, z2.s
 ; CHECK_EXTEND_ROUND-NEXT: fcvt z3.d, p0/m, z3.s
@@ -176,11 +176,11 @@ define @test_copysign_v4f16_v4f64( %a, @test_copysign_v8f16_v8f32( %a, %arg1){
 ; CHECK-LABEL: wide_add_shift_add_rshrnb_b:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.b
 ; CHECK-NEXT: rshrnb z1.b, z1.h, #6
 ; CHECK-NEXT: rshrnb z0.b, z0.h, #6
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, x1]
 ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
-; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, x1]
-; CHECK-NEXT: add z0.b,
z1.b, z0.b +; CHECK-NEXT: add z0.b, z2.b, z0.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0, x1] ; CHECK-NEXT: ret %1 = add %arg1, splat (i16 32) @@ -141,12 +141,12 @@ define void @wide_add_shift_add_rshrnb_b(ptr %dest, i64 %index, %arg1){ ; CHECK-LABEL: wide_add_shift_add_rshrnb_h: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rshrnb z1.h, z1.s, #6 ; CHECK-NEXT: rshrnb z0.h, z0.s, #6 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x1, lsl #1] ; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEXT: add z0.h, z2.h, z0.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1] ; CHECK-NEXT: ret %1 = add %arg1, splat (i32 32) @@ -162,12 +162,12 @@ define void @wide_add_shift_add_rshrnb_h(ptr %dest, i64 %index, %arg1){ ; CHECK-LABEL: wide_add_shift_add_rshrnb_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: rshrnb z1.s, z1.d, #32 ; CHECK-NEXT: rshrnb z0.s, z0.d, #32 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x1, lsl #2] ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x1, lsl #2] -; CHECK-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEXT: add z0.s, z2.s, z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2] ; CHECK-NEXT: ret %1 = add %arg1, splat (i64 2147483648) @@ -188,11 +188,11 @@ define void @neg_wide_add_shift_add_rshrnb_d(ptr %dest, i64 %index, %arg1, splat (i64 140737488355328) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll index 8bfcd088e1a863..500973d053f5b8 100644 --- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll @@ -437,8 +437,8 @@ define @uqsub_i32_ptrue_all_h( %a) #0 { define @uqsub_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: uqsub_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uqsub z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) diff --git a/llvm/test/CodeGen/AArch64/sve2-rsh.ll b/llvm/test/CodeGen/AArch64/sve2-rsh.ll index 516ef3bd581ee2..9addd16f89292a 100644 --- a/llvm/test/CodeGen/AArch64/sve2-rsh.ll +++ b/llvm/test/CodeGen/AArch64/sve2-rsh.ll @@ -18,8 +18,8 @@ define @neg_urshr_1( %x) { define @neg_urshr_2( %x, %y) { ; CHECK-LABEL: neg_urshr_2: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add z0.d, z0.d, #32 // =0x20 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %add = add nuw nsw %x, splat (i64 32) diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll index e297ade6b9ae1f..e5a240b7a53fdc 100644 --- a/llvm/test/CodeGen/AArch64/sve2-xar.ll +++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll @@ -152,9 +152,9 @@ define @xar_nxv2i64_l_neg1( %x, , } @sel_x2_i8(target("aarch64.svc ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.b, z1.b }, pn8, { z6.b, z7.b }, { z4.b, z5.b } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -28,12 +28,12 @@ define { , } @sel_x2_i16(target("aarch64.sv ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -48,12 +48,12 @@ define { , } @sel_x2_f16(target("aarch64. ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -68,12 +68,12 @@ define { , } @sel_x2_bf16(target("aar ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -88,12 +88,12 @@ define { , } @sel_x2_i32(target("aarch64.sv ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -108,12 +108,12 @@ define { , } @sel_x2_f32(target("aarch6 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -128,12 +128,12 @@ define { , } @sel_x2_i64(target("aarch64.sv ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -148,12 +148,12 @@ define { , } @sel_x2_f64(target("aarc ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll index df504362680ba1..3a21eaead5f72e 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll @@ -8,17 +8,17 @@ define { , , , , , , , , , , , , , , , , , , , , , , , , %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -117,9 +117,9 @@ define void @st1_x2_f32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: st1w { z2.s, z3.s }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -135,9 +135,9 @@ define void @st1_x2_f64( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: st1d { z2.d, z3.d }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -153,8 +153,8 @@ define void @st1_x4_i8( %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -273,8 +273,8 @@ define void @st1_x4_f32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -293,8 +293,8 @@ define void @st1_x4_f64( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -315,9 +315,9 @@ define void @stnt1_x2_i8( %unused, %zn0, %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -351,9 +351,9 @@ define void @stnt1_x2_i32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -369,9 +369,9 @@ define void @stnt1_x2_i64( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -387,9 +387,9 @@ define void @stnt1_x2_f16( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -405,9 +405,9 @@ define void @stnt1_x2_bf16( %unused, %zn ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -423,9 +423,9 @@ define void @stnt1_x2_f32( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -441,9 +441,9 @@ define void @stnt1_x2_f64( %unused, %zn0 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -459,8 +459,8 @@ define void @stnt1_x4_i8( %unused, %zn0, %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -499,8 +499,8 @@ define void @stnt1_x4_i32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -519,8 +519,8 @@ define void @stnt1_x4_i64( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -539,8 +539,8 @@ define void @stnt1_x4_f16( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -559,8 +559,8 @@ define void @stnt1_x4_bf16( %unused, %zn ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -579,8 +579,8 @@ define void @stnt1_x4_f32( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -599,8 +599,8 @@ define void @stnt1_x4_f64( %unused, %zn0 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 30ff70088454d7..16521834090b58 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -147,11 +147,11 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v4i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldr s1, [x0] -; CHECK-SD-NEXT: ldr s2, [x1] -; CHECK-SD-NEXT: movi d0, #0xff00ff00ff00ff -; CHECK-SD-NEXT: uaddl v1.8h, v1.8b, v2.8b -; CHECK-SD-NEXT: umin v0.4h, v1.4h, v0.4h +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: movi d2, #0xff00ff00ff00ff +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: umin v0.4h, v0.4h, v2.4h ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: str s0, [x2] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll index b31ce94cdaaea1..d5f1febaeb7dbb 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -52,11 +52,11 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: shl v1.4s, v0.4s, #31 -; CHECK-NEXT: usra v1.4s, v0.4s, #1 -; CHECK-NEXT: movi v0.16b, #170 -; CHECK-NEXT: fneg v0.4s, v0.4s -; CHECK-NEXT: cmhs v0.4s, v0.4s, v1.4s +; CHECK-NEXT: movi v1.16b, #170 +; CHECK-NEXT: shl v2.4s, v0.4s, #31 +; CHECK-NEXT: fneg v1.4s, v1.4s +; CHECK-NEXT: usra v2.4s, v0.4s, #1 +; CHECK-NEXT: cmhs v0.4s, v1.4s, v2.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll index 00609b0df9b4e1..37c6374215d811 100644 --- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll +++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll @@ -100,9 +100,9 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; CHECK-NEXT: mov v0.s[3], w3 ; CHECK-NEXT: cmhi v3.4s, v3.4s, v2.4s ; CHECK-NEXT: str d2, [x8, #16] -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s ; CHECK-NEXT: mov w5, v3.s[1] ; CHECK-NEXT: fmov w4, s3 +; CHECK-NEXT: add v1.4s, v0.4s, v1.4s ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: str q1, [x8] ; 
CHECK-NEXT: mov w1, v0.s[1] @@ -248,10 +248,10 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { ; CHECK-NEXT: eor v2.8b, v0.8b, v1.8b ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: shl v2.4h, v2.4h, #15 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: cmlt v1.4h, v2.4h, #0 +; CHECK-NEXT: shl v1.4h, v2.4h, #15 ; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: cmlt v1.4h, v1.4h, #0 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b ; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 66ef436f48c637..3254c5ebe9c6b1 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -2177,65 +2177,65 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) { ; CHECK-GI-BASE-LABEL: test_udot_v48i8: ; CHECK-GI-BASE: // %bb.0: // %entry ; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1] -; CHECK-GI-BASE-NEXT: ldr q6, [x1, #32] +; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32] ; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0] -; CHECK-GI-BASE-NEXT: ldr q17, [x0, #32] +; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32] +; CHECK-GI-BASE-NEXT: ushll v20.8h, v6.8b, #0 +; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v6.16b, #0 ; CHECK-GI-BASE-NEXT: ushll v4.8h, v0.8b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-GI-BASE-NEXT: ushll v7.8h, v3.8b, #0 +; CHECK-GI-BASE-NEXT: ushll v16.8h, v3.8b, #0 ; CHECK-GI-BASE-NEXT: ushll v5.8h, v1.8b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-GI-BASE-NEXT: ushll v16.8h, v2.8b, #0 +; CHECK-GI-BASE-NEXT: ushll v17.8h, v2.8b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v3.8h, v3.16b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-GI-BASE-NEXT: umull v18.4s, v4.4h, v5.4h ; CHECK-GI-BASE-NEXT: umull2 v4.4s, v4.8h, v5.8h -; CHECK-GI-BASE-NEXT: umull2 v19.4s, v0.8h, v1.8h -; CHECK-GI-BASE-NEXT: umull v20.4s, v7.4h, v16.4h -; CHECK-GI-BASE-NEXT: umull v0.4s, v0.4h, v1.4h -; CHECK-GI-BASE-NEXT: ushll v5.8h, v6.8b, #0 -; CHECK-GI-BASE-NEXT: ushll v1.8h, v17.8b, #0 -; CHECK-GI-BASE-NEXT: umull2 v7.4s, v7.8h, v16.8h -; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v6.16b, #0 -; CHECK-GI-BASE-NEXT: ushll2 v17.8h, v17.16b, #0 -; CHECK-GI-BASE-NEXT: addv s16, v18.4s -; CHECK-GI-BASE-NEXT: addv s4, v4.4s -; CHECK-GI-BASE-NEXT: umull v18.4s, v3.4h, v2.4h +; CHECK-GI-BASE-NEXT: umull v5.4s, v0.4h, v1.4h +; CHECK-GI-BASE-NEXT: umull2 v0.4s, v0.8h, v1.8h +; CHECK-GI-BASE-NEXT: umull v19.4s, v16.4h, v17.4h +; CHECK-GI-BASE-NEXT: ushll v1.8h, v7.8b, #0 +; CHECK-GI-BASE-NEXT: umull2 v16.4s, v16.8h, v17.8h +; CHECK-GI-BASE-NEXT: umull v17.4s, v3.4h, v2.4h ; CHECK-GI-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h -; CHECK-GI-BASE-NEXT: addv s3, v19.4s -; CHECK-GI-BASE-NEXT: umull v19.4s, v5.4h, v1.4h -; CHECK-GI-BASE-NEXT: umull2 v1.4s, v5.8h, v1.8h -; CHECK-GI-BASE-NEXT: addv s5, v20.4s +; CHECK-GI-BASE-NEXT: ushll2 v7.8h, v7.16b, #0 +; CHECK-GI-BASE-NEXT: addv s18, v18.4s +; CHECK-GI-BASE-NEXT: addv s4, v4.4s +; CHECK-GI-BASE-NEXT: addv s5, v5.4s ; CHECK-GI-BASE-NEXT: addv s0, v0.4s -; CHECK-GI-BASE-NEXT: addv s7, v7.4s -; CHECK-GI-BASE-NEXT: umull v20.4s, v6.4h, v17.4h -; CHECK-GI-BASE-NEXT: umull2 v6.4s, v6.8h, v17.8h -; CHECK-GI-BASE-NEXT: fmov w8, s16 -; CHECK-GI-BASE-NEXT: fmov w9, s4 -; CHECK-GI-BASE-NEXT: fmov w10, s3 -; CHECK-GI-BASE-NEXT: addv s3, v18.4s +; CHECK-GI-BASE-NEXT: addv s19, v19.4s +; CHECK-GI-BASE-NEXT: umull v3.4s, v1.4h, v20.4h ; CHECK-GI-BASE-NEXT: 
addv s2, v2.4s -; CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: addv s4, v19.4s +; CHECK-GI-BASE-NEXT: umull2 v1.4s, v1.8h, v20.8h +; CHECK-GI-BASE-NEXT: umull v20.4s, v7.4h, v6.4h +; CHECK-GI-BASE-NEXT: fmov w8, s18 +; CHECK-GI-BASE-NEXT: fmov w9, s4 +; CHECK-GI-BASE-NEXT: fmov w10, s5 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: fmov w12, s19 +; CHECK-GI-BASE-NEXT: addv s4, v16.4s +; CHECK-GI-BASE-NEXT: addv s5, v17.4s +; CHECK-GI-BASE-NEXT: addv s3, v3.4s +; CHECK-GI-BASE-NEXT: umull2 v0.4s, v7.8h, v6.8h ; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s0 -; CHECK-GI-BASE-NEXT: addv s0, v1.4s -; CHECK-GI-BASE-NEXT: addv s1, v20.4s -; CHECK-GI-BASE-NEXT: addv s5, v6.4s -; CHECK-GI-BASE-NEXT: add w10, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w11, s3 +; CHECK-GI-BASE-NEXT: addv s1, v1.4s +; CHECK-GI-BASE-NEXT: add w9, w11, w12 +; CHECK-GI-BASE-NEXT: add w8, w8, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s5 ; CHECK-GI-BASE-NEXT: fmov w12, s2 -; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s7 -; CHECK-GI-BASE-NEXT: add w9, w10, w9 +; CHECK-GI-BASE-NEXT: addv s4, v20.4s +; CHECK-GI-BASE-NEXT: addv s0, v0.4s +; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: add w10, w11, w12 -; CHECK-GI-BASE-NEXT: fmov w11, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s3 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w10, s0 -; CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: fmov w10, s1 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: add w9, w9, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 ; CHECK-GI-BASE-NEXT: add w0, w8, w9 @@ -2527,65 +2527,65 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) { ; CHECK-GI-BASE-LABEL: test_sdot_v48i8: ; CHECK-GI-BASE: // %bb.0: // %entry ; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1] -; CHECK-GI-BASE-NEXT: ldr q6, [x1, #32] +; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32] ; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0] -; CHECK-GI-BASE-NEXT: ldr q17, [x0, #32] +; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32] +; CHECK-GI-BASE-NEXT: sshll v20.8h, v6.8b, #0 +; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v6.16b, #0 ; CHECK-GI-BASE-NEXT: sshll v4.8h, v0.8b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0 -; CHECK-GI-BASE-NEXT: sshll v7.8h, v3.8b, #0 +; CHECK-GI-BASE-NEXT: sshll v16.8h, v3.8b, #0 ; CHECK-GI-BASE-NEXT: sshll v5.8h, v1.8b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-GI-BASE-NEXT: sshll v16.8h, v2.8b, #0 +; CHECK-GI-BASE-NEXT: sshll v17.8h, v2.8b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v3.8h, v3.16b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-GI-BASE-NEXT: smull v18.4s, v4.4h, v5.4h ; CHECK-GI-BASE-NEXT: smull2 v4.4s, v4.8h, v5.8h -; CHECK-GI-BASE-NEXT: smull2 v19.4s, v0.8h, v1.8h -; CHECK-GI-BASE-NEXT: smull v20.4s, v7.4h, v16.4h -; CHECK-GI-BASE-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-GI-BASE-NEXT: sshll v5.8h, v6.8b, #0 -; CHECK-GI-BASE-NEXT: sshll v1.8h, v17.8b, #0 -; CHECK-GI-BASE-NEXT: smull2 v7.4s, v7.8h, v16.8h -; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v6.16b, #0 -; CHECK-GI-BASE-NEXT: sshll2 v17.8h, v17.16b, #0 -; CHECK-GI-BASE-NEXT: addv s16, v18.4s -; CHECK-GI-BASE-NEXT: addv s4, v4.4s -; CHECK-GI-BASE-NEXT: smull v18.4s, v3.4h, v2.4h +; CHECK-GI-BASE-NEXT: smull v5.4s, v0.4h, v1.4h +; CHECK-GI-BASE-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-GI-BASE-NEXT: smull v19.4s, v16.4h, v17.4h +; 
CHECK-GI-BASE-NEXT: sshll v1.8h, v7.8b, #0 +; CHECK-GI-BASE-NEXT: smull2 v16.4s, v16.8h, v17.8h +; CHECK-GI-BASE-NEXT: smull v17.4s, v3.4h, v2.4h ; CHECK-GI-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h -; CHECK-GI-BASE-NEXT: addv s3, v19.4s -; CHECK-GI-BASE-NEXT: smull v19.4s, v5.4h, v1.4h -; CHECK-GI-BASE-NEXT: smull2 v1.4s, v5.8h, v1.8h -; CHECK-GI-BASE-NEXT: addv s5, v20.4s +; CHECK-GI-BASE-NEXT: sshll2 v7.8h, v7.16b, #0 +; CHECK-GI-BASE-NEXT: addv s18, v18.4s +; CHECK-GI-BASE-NEXT: addv s4, v4.4s +; CHECK-GI-BASE-NEXT: addv s5, v5.4s ; CHECK-GI-BASE-NEXT: addv s0, v0.4s -; CHECK-GI-BASE-NEXT: addv s7, v7.4s -; CHECK-GI-BASE-NEXT: smull v20.4s, v6.4h, v17.4h -; CHECK-GI-BASE-NEXT: smull2 v6.4s, v6.8h, v17.8h -; CHECK-GI-BASE-NEXT: fmov w8, s16 -; CHECK-GI-BASE-NEXT: fmov w9, s4 -; CHECK-GI-BASE-NEXT: fmov w10, s3 -; CHECK-GI-BASE-NEXT: addv s3, v18.4s +; CHECK-GI-BASE-NEXT: addv s19, v19.4s +; CHECK-GI-BASE-NEXT: smull v3.4s, v1.4h, v20.4h ; CHECK-GI-BASE-NEXT: addv s2, v2.4s -; CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: addv s4, v19.4s +; CHECK-GI-BASE-NEXT: smull2 v1.4s, v1.8h, v20.8h +; CHECK-GI-BASE-NEXT: smull v20.4s, v7.4h, v6.4h +; CHECK-GI-BASE-NEXT: fmov w8, s18 +; CHECK-GI-BASE-NEXT: fmov w9, s4 +; CHECK-GI-BASE-NEXT: fmov w10, s5 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: fmov w12, s19 +; CHECK-GI-BASE-NEXT: addv s4, v16.4s +; CHECK-GI-BASE-NEXT: addv s5, v17.4s +; CHECK-GI-BASE-NEXT: addv s3, v3.4s +; CHECK-GI-BASE-NEXT: smull2 v0.4s, v7.8h, v6.8h ; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s0 -; CHECK-GI-BASE-NEXT: addv s0, v1.4s -; CHECK-GI-BASE-NEXT: addv s1, v20.4s -; CHECK-GI-BASE-NEXT: addv s5, v6.4s -; CHECK-GI-BASE-NEXT: add w10, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w11, s3 +; CHECK-GI-BASE-NEXT: addv s1, v1.4s +; CHECK-GI-BASE-NEXT: add w9, w11, w12 +; CHECK-GI-BASE-NEXT: add w8, w8, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s5 ; CHECK-GI-BASE-NEXT: fmov w12, s2 -; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s7 -; CHECK-GI-BASE-NEXT: add w9, w10, w9 +; CHECK-GI-BASE-NEXT: addv s4, v20.4s +; CHECK-GI-BASE-NEXT: addv s0, v0.4s +; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: add w10, w11, w12 -; CHECK-GI-BASE-NEXT: fmov w11, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s3 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w10, s0 -; CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: fmov w10, s1 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: add w9, w9, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 ; CHECK-GI-BASE-NEXT: add w0, w8, w9 @@ -2640,13 +2640,13 @@ define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) { ; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use: ; CHECK-SD-DOT: // %bb.0: // %entry ; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-DOT-NEXT: ushll v3.8h, v0.8b, #0 +; CHECK-SD-DOT-NEXT: ushll v4.8h, v1.8b, #0 ; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b -; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-DOT-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-SD-DOT-NEXT: umull v0.4s, v1.4h, v0.4h -; CHECK-SD-DOT-NEXT: addp v2.2s, v2.2s, v2.2s +; CHECK-SD-DOT-NEXT: umull v0.4s, v4.4h, v3.4h +; CHECK-SD-DOT-NEXT: addp v1.2s, v2.2s, v2.2s ; CHECK-SD-DOT-NEXT: fmov w9, s0 -; CHECK-SD-DOT-NEXT: fmov w8, s2 +; CHECK-SD-DOT-NEXT: fmov w8, s1 ; CHECK-SD-DOT-NEXT: add w0, 
w8, w9 ; CHECK-SD-DOT-NEXT: ret ; @@ -3534,21 +3534,21 @@ entry: define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) { ; CHECK-SD-LABEL: add_pair_v4i8_v4i64_sext: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-SD-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-SD-NEXT: ushll v3.2d, v1.2s, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v2.2d, v1.2s, #0 +; CHECK-SD-NEXT: ushll v3.2d, v0.2s, #0 ; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0 ; CHECK-SD-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-SD-NEXT: shl v2.2d, v2.2d, #56 ; CHECK-SD-NEXT: shl v3.2d, v3.2d, #56 +; CHECK-SD-NEXT: shl v2.2d, v2.2d, #56 ; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56 ; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56 -; CHECK-SD-NEXT: sshr v2.2d, v2.2d, #56 ; CHECK-SD-NEXT: sshr v3.2d, v3.2d, #56 -; CHECK-SD-NEXT: ssra v2.2d, v0.2d, #56 -; CHECK-SD-NEXT: ssra v3.2d, v1.2d, #56 -; CHECK-SD-NEXT: add v0.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: sshr v2.2d, v2.2d, #56 +; CHECK-SD-NEXT: ssra v3.2d, v0.2d, #56 +; CHECK-SD-NEXT: ssra v2.2d, v1.2d, #56 +; CHECK-SD-NEXT: add v0.2d, v3.2d, v2.2d ; CHECK-SD-NEXT: addp d0, v0.2d ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret @@ -3816,37 +3816,37 @@ define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) { ; CHECK-SD-NEXT: ldr b1, [sp, #64] ; CHECK-SD-NEXT: add x8, sp, #72 ; CHECK-SD-NEXT: ldr b2, [sp] -; CHECK-SD-NEXT: add x9, sp, #8 +; CHECK-SD-NEXT: add x9, sp, #80 ; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #80 +; CHECK-SD-NEXT: add x8, sp, #8 ; CHECK-SD-NEXT: mov v0.b[1], w1 -; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 -; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #88 -; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #24 +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #88 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #24 ; CHECK-SD-NEXT: mov v0.b[2], w2 -; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #96 -; CHECK-SD-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-NEXT: add x9, sp, #32 +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-NEXT: add x9, sp, #96 +; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #32 ; CHECK-SD-NEXT: mov v0.b[3], w3 -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #104 -; CHECK-SD-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #112 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: add x9, sp, #104 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #40 +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-NEXT: add x9, sp, #112 ; CHECK-SD-NEXT: mov v0.b[4], w4 -; CHECK-SD-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-NEXT: add x9, sp, #48 -; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #120 -; CHECK-SD-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-NEXT: mov v0.b[5], w5 -; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-NEXT: add x9, sp, #120 +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #56 +; CHECK-SD-NEXT: mov v0.b[5], w5 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-NEXT: mov v0.b[6], w6 ; CHECK-SD-NEXT: mov v0.b[7], w7 @@ 
-3942,37 +3942,37 @@ define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) { ; CHECK-SD-NEXT: ldr b1, [sp, #64] ; CHECK-SD-NEXT: add x8, sp, #72 ; CHECK-SD-NEXT: ldr b2, [sp] -; CHECK-SD-NEXT: add x9, sp, #8 +; CHECK-SD-NEXT: add x9, sp, #80 ; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #80 +; CHECK-SD-NEXT: add x8, sp, #8 ; CHECK-SD-NEXT: mov v0.b[1], w1 -; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 -; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #88 -; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #24 +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #88 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #24 ; CHECK-SD-NEXT: mov v0.b[2], w2 -; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #96 -; CHECK-SD-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-NEXT: add x9, sp, #32 +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-NEXT: add x9, sp, #96 +; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #32 ; CHECK-SD-NEXT: mov v0.b[3], w3 -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #104 -; CHECK-SD-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #112 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: add x9, sp, #104 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #40 +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-NEXT: add x9, sp, #112 ; CHECK-SD-NEXT: mov v0.b[4], w4 -; CHECK-SD-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-NEXT: add x9, sp, #48 -; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #120 -; CHECK-SD-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-NEXT: mov v0.b[5], w5 -; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-NEXT: add x9, sp, #120 +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #56 +; CHECK-SD-NEXT: mov v0.b[5], w5 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-NEXT: mov v0.b[6], w6 ; CHECK-SD-NEXT: mov v0.b[7], w7 @@ -4069,48 +4069,48 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { ; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64] ; CHECK-SD-BASE-NEXT: add x8, sp, #72 ; CHECK-SD-BASE-NEXT: ldr b2, [sp] -; CHECK-SD-BASE-NEXT: add x9, sp, #8 +; CHECK-SD-BASE-NEXT: add x9, sp, #80 ; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #80 +; CHECK-SD-BASE-NEXT: add x8, sp, #8 ; CHECK-SD-BASE-NEXT: mov v0.b[1], w1 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #16 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #88 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #24 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #16 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #88 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #24 ; CHECK-SD-BASE-NEXT: mov v0.b[2], w2 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #96 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #32 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #96 +; 
CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #32 ; CHECK-SD-BASE-NEXT: mov v0.b[3], w3 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #104 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #40 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #112 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #104 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #40 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #112 ; CHECK-SD-BASE-NEXT: mov v0.b[4], w4 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #48 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #120 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #48 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #120 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-BASE-NEXT: add x8, sp, #56 +; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-BASE-NEXT: mov v0.b[6], w6 ; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-BASE-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-SD-BASE-NEXT: mov v0.b[7], w7 ; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-SD-BASE-NEXT: uaddl2 v3.4s, v0.8h, v1.8h ; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h -; CHECK-SD-BASE-NEXT: ushll v1.8h, v2.8b, #0 -; CHECK-SD-BASE-NEXT: uaddw2 v2.4s, v3.4s, v1.8h -; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v3.4s, v2.8h +; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v2.4h +; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-SD-BASE-NEXT: addv s0, v0.4s ; CHECK-SD-BASE-NEXT: fmov w0, s0 ; CHECK-SD-BASE-NEXT: ret @@ -4147,9 +4147,9 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { ; CHECK-SD-DOT-NEXT: udot v4.2s, v1.8b, v5.8b ; CHECK-SD-DOT-NEXT: mov v0.b[7], w7 ; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s +; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #8 -; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #16 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8] @@ -4342,48 +4342,48 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { ; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64] ; CHECK-SD-BASE-NEXT: add x8, sp, #72 ; CHECK-SD-BASE-NEXT: ldr b2, [sp] -; CHECK-SD-BASE-NEXT: add x9, sp, #8 +; CHECK-SD-BASE-NEXT: add x9, sp, #80 ; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #80 +; CHECK-SD-BASE-NEXT: add x8, sp, #8 ; CHECK-SD-BASE-NEXT: mov v0.b[1], w1 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #16 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #88 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #24 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #16 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #88 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #24 ; CHECK-SD-BASE-NEXT: mov v0.b[2], w2 -; CHECK-SD-BASE-NEXT: ld1 { 
v1.b }[3], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #96 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #32 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #96 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #32 ; CHECK-SD-BASE-NEXT: mov v0.b[3], w3 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #104 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #40 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #112 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #104 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #40 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #112 ; CHECK-SD-BASE-NEXT: mov v0.b[4], w4 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #48 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #120 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #48 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #120 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-BASE-NEXT: add x8, sp, #56 +; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-BASE-NEXT: mov v0.b[6], w6 ; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-SD-BASE-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-SD-BASE-NEXT: mov v0.b[7], w7 ; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-SD-BASE-NEXT: saddl2 v3.4s, v0.8h, v1.8h ; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h -; CHECK-SD-BASE-NEXT: sshll v1.8h, v2.8b, #0 -; CHECK-SD-BASE-NEXT: saddw2 v2.4s, v3.4s, v1.8h -; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v1.4h -; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v3.4s, v2.8h +; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v2.4h +; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-SD-BASE-NEXT: addv s0, v0.4s ; CHECK-SD-BASE-NEXT: fmov w0, s0 ; CHECK-SD-BASE-NEXT: ret @@ -4420,9 +4420,9 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { ; CHECK-SD-DOT-NEXT: sdot v4.2s, v1.8b, v5.8b ; CHECK-SD-DOT-NEXT: mov v0.b[7], w7 ; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s +; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #8 -; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #16 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8] @@ -4611,23 +4611,23 @@ entry: define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-SD-BASE-LABEL: full: ; CHECK-SD-BASE: // %bb.0: // %entry -; CHECK-SD-BASE-NEXT: ldr d0, [x2] -; CHECK-SD-BASE-NEXT: ldr d1, [x0] ; CHECK-SD-BASE-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-SD-BASE-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-SD-BASE-NEXT: sxtw x8, w3 ; CHECK-SD-BASE-NEXT: sxtw x9, w1 -; CHECK-SD-BASE-NEXT: uabdl v0.8h, v1.8b, v0.8b -; CHECK-SD-BASE-NEXT: add x11, x2, x8 +; CHECK-SD-BASE-NEXT: ldr d0, [x0] +; CHECK-SD-BASE-NEXT: ldr d1, [x2] ; CHECK-SD-BASE-NEXT: add x10, x0, x9 -; CHECK-SD-BASE-NEXT: ldr d2, [x11] -; CHECK-SD-BASE-NEXT: add x11, x11, x8 +; CHECK-SD-BASE-NEXT: add x11, x2, x8 +; CHECK-SD-BASE-NEXT: uabdl 
v0.8h, v0.8b, v1.8b ; CHECK-SD-BASE-NEXT: ldr d1, [x10] +; CHECK-SD-BASE-NEXT: ldr d2, [x11] ; CHECK-SD-BASE-NEXT: add x10, x10, x9 -; CHECK-SD-BASE-NEXT: uaddlp v0.4s, v0.8h +; CHECK-SD-BASE-NEXT: add x11, x11, x8 ; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b ; CHECK-SD-BASE-NEXT: ldr d2, [x11] ; CHECK-SD-BASE-NEXT: add x11, x11, x8 +; CHECK-SD-BASE-NEXT: uaddlp v0.4s, v0.8h ; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h ; CHECK-SD-BASE-NEXT: ldr d1, [x10] ; CHECK-SD-BASE-NEXT: add x10, x10, x9 @@ -4723,98 +4723,98 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w3 killed $w3 def $x3 -; CHECK-GI-NEXT: sxtw x8, w1 -; CHECK-GI-NEXT: sxtw x9, w3 +; CHECK-GI-NEXT: sxtw x8, w3 +; CHECK-GI-NEXT: sxtw x9, w1 ; CHECK-GI-NEXT: ldr d0, [x0] ; CHECK-GI-NEXT: ldr d1, [x2] -; CHECK-GI-NEXT: add x10, x0, x8 -; CHECK-GI-NEXT: add x11, x2, x9 +; CHECK-GI-NEXT: add x10, x0, x9 +; CHECK-GI-NEXT: add x11, x2, x8 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: ldr d2, [x10] -; CHECK-GI-NEXT: ldr d3, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x12, x11, x8 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 +; CHECK-GI-NEXT: ldr d3, [x11] +; CHECK-GI-NEXT: ldr d4, [x10] +; CHECK-GI-NEXT: ldr d5, [x12] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x12, x8 ; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-GI-NEXT: ldr d4, [x10] -; CHECK-GI-NEXT: ldr d5, [x11] -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 +; CHECK-GI-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 ; CHECK-GI-NEXT: uabdl v6.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: uabdl2 v0.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: ldr d1, [x10] -; CHECK-GI-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 ; CHECK-GI-NEXT: ldr d7, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: uabdl v16.4s, v2.4h, v3.4h ; CHECK-GI-NEXT: uabdl2 v2.4s, v2.8h, v3.8h -; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0 -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 -; CHECK-GI-NEXT: uabdl v1.4s, v4.4h, v5.4h +; CHECK-GI-NEXT: uabdl v3.4s, v4.4h, v5.4h ; CHECK-GI-NEXT: uabdl2 v4.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0 ; CHECK-GI-NEXT: ldr d5, [x10] -; CHECK-GI-NEXT: add v2.4s, v16.4s, v2.4s -; CHECK-GI-NEXT: ldr d16, [x11] +; CHECK-GI-NEXT: ldr d17, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-GI-NEXT: uabdl v6.4s, v3.4h, v7.4h -; CHECK-GI-NEXT: uabdl2 v3.4s, v3.8h, v7.8h ; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: ushll v7.8h, v16.8b, #0 -; CHECK-GI-NEXT: add x11, x11, x9 -; CHECK-GI-NEXT: ldr d16, [x10] -; CHECK-GI-NEXT: ldr d17, [x11] -; CHECK-GI-NEXT: add v1.4s, v1.4s, v4.4s -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 -; CHECK-GI-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-GI-NEXT: ushll v16.8h, v16.8b, #0 ; CHECK-GI-NEXT: ushll v17.8h, v17.8b, #0 -; CHECK-GI-NEXT: uabdl v22.4s, v5.4h, v7.4h -; CHECK-GI-NEXT: uabdl2 v5.4s, v5.8h, v7.8h +; CHECK-GI-NEXT: add v2.4s, v16.4s, v2.4s +; CHECK-GI-NEXT: add v3.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: uabdl v4.4s, 
v1.4h, v7.4h +; CHECK-GI-NEXT: uabdl2 v1.4s, v1.8h, v7.8h +; CHECK-GI-NEXT: ldr d7, [x10] +; CHECK-GI-NEXT: ldr d16, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: ldr d18, [x10] +; CHECK-GI-NEXT: ldr d20, [x10, x9] ; CHECK-GI-NEXT: ldr d19, [x11] -; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: ldr d21, [x11, x8] +; CHECK-GI-NEXT: uabdl v6.4s, v5.4h, v17.4h +; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0 +; CHECK-GI-NEXT: ushll v16.8h, v16.8b, #0 +; CHECK-GI-NEXT: uabdl2 v5.4s, v5.8h, v17.8h +; CHECK-GI-NEXT: ushll v17.8h, v18.8b, #0 +; CHECK-GI-NEXT: ushll v18.8h, v19.8b, #0 +; CHECK-GI-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-GI-NEXT: ushll v4.8h, v20.8b, #0 +; CHECK-GI-NEXT: ushll v19.8h, v21.8b, #0 ; CHECK-GI-NEXT: addv s2, v2.4s -; CHECK-GI-NEXT: addv s1, v1.4s -; CHECK-GI-NEXT: ushll v18.8h, v18.8b, #0 -; CHECK-GI-NEXT: ushll v19.8h, v19.8b, #0 -; CHECK-GI-NEXT: uabdl v4.4s, v16.4h, v17.4h -; CHECK-GI-NEXT: uabdl2 v16.4s, v16.8h, v17.8h -; CHECK-GI-NEXT: add v5.4s, v22.4s, v5.4s -; CHECK-GI-NEXT: ldr d20, [x10, x8] -; CHECK-GI-NEXT: ldr d21, [x11, x9] +; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: addv s3, v3.4s +; CHECK-GI-NEXT: uabdl v20.4s, v7.4h, v16.4h +; CHECK-GI-NEXT: uabdl2 v7.4s, v7.8h, v16.8h +; CHECK-GI-NEXT: add v5.4s, v6.4s, v5.4s +; CHECK-GI-NEXT: uabdl v6.4s, v17.4h, v18.4h +; CHECK-GI-NEXT: uabdl2 v16.4s, v17.8h, v18.8h +; CHECK-GI-NEXT: uabdl v17.4s, v4.4h, v19.4h +; CHECK-GI-NEXT: uabdl2 v4.4s, v4.8h, v19.8h ; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: addv s1, v1.4s ; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: ushll v7.8h, v20.8b, #0 -; CHECK-GI-NEXT: ushll v20.8h, v21.8b, #0 -; CHECK-GI-NEXT: uabdl v6.4s, v18.4h, v19.4h -; CHECK-GI-NEXT: uabdl2 v17.4s, v18.8h, v19.8h -; CHECK-GI-NEXT: add v4.4s, v4.4s, v16.4s -; CHECK-GI-NEXT: addv s5, v5.4s -; CHECK-GI-NEXT: fmov w10, s1 +; CHECK-GI-NEXT: fmov w10, s3 +; CHECK-GI-NEXT: add v7.4s, v20.4s, v7.4s +; CHECK-GI-NEXT: add v0.4s, v17.4s, v4.4s +; CHECK-GI-NEXT: addv s4, v5.4s +; CHECK-GI-NEXT: add v2.4s, v6.4s, v16.4s ; CHECK-GI-NEXT: add w8, w8, w9 -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: uabdl v18.4s, v7.4h, v20.4h -; CHECK-GI-NEXT: uabdl2 v7.4s, v7.8h, v20.8h -; CHECK-GI-NEXT: add v6.4s, v6.4s, v17.4s +; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w8, w10, w8 -; CHECK-GI-NEXT: addv s0, v4.4s +; CHECK-GI-NEXT: addv s3, v7.4s +; CHECK-GI-NEXT: addv s1, v2.4s +; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s5 -; CHECK-GI-NEXT: add v7.4s, v18.4s, v7.4s -; CHECK-GI-NEXT: addv s1, v6.4s +; CHECK-GI-NEXT: fmov w9, s4 ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: addv s2, v7.4s +; CHECK-GI-NEXT: fmov w9, s3 ; CHECK-GI-NEXT: add w8, w9, w8 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: fmov w9, s0 ; CHECK-GI-NEXT: add w0, w9, w8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll index de26676b5c73ee..063b23275c6167 100644 --- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll @@ -264,15 +264,15 @@ define <4 x bfloat> @test_copysign_v4bf16_v4bf16(<4 x bfloat> %a, <4 x bfloat> % define <4 x bfloat> @test_copysign_v4bf16_v4f32(<4 x bfloat> %a, <4 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v4bf16_v4f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.4s v2, #127, msl #8 -; CHECK-NEXT: movi.4s v3, #1 +; 
CHECK-NEXT: movi.4s v2, #1 +; CHECK-NEXT: movi.4s v3, #127, msl #8 ; CHECK-NEXT: ushr.4s v4, v1, #16 -; CHECK-NEXT: add.4s v2, v1, v2 -; CHECK-NEXT: and.16b v3, v4, v3 -; CHECK-NEXT: add.4s v2, v3, v2 -; CHECK-NEXT: fcmeq.4s v3, v1, v1 +; CHECK-NEXT: and.16b v2, v4, v2 +; CHECK-NEXT: add.4s v3, v1, v3 +; CHECK-NEXT: fcmeq.4s v4, v1, v1 ; CHECK-NEXT: orr.4s v1, #64, lsl #16 -; CHECK-NEXT: bit.16b v1, v2, v3 +; CHECK-NEXT: add.4s v2, v2, v3 +; CHECK-NEXT: bit.16b v1, v2, v4 ; CHECK-NEXT: mvni.4h v2, #128, lsl #8 ; CHECK-NEXT: shrn.4h v1, v1, #16 ; CHECK-NEXT: bif.8b v0, v1, v2 @@ -286,16 +286,16 @@ define <4 x bfloat> @test_copysign_v4bf16_v4f64(<4 x bfloat> %a, <4 x double> %b ; CHECK-LABEL: test_copysign_v4bf16_v4f64: ; CHECK: ; %bb.0: ; CHECK-NEXT: fcvtxn v1.2s, v1.2d -; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: movi.4s v3, #127, msl #8 ; CHECK-NEXT: fcvtxn2 v1.4s, v2.2d -; CHECK-NEXT: movi.4s v2, #127, msl #8 +; CHECK-NEXT: movi.4s v2, #1 ; CHECK-NEXT: ushr.4s v4, v1, #16 -; CHECK-NEXT: add.4s v2, v1, v2 -; CHECK-NEXT: and.16b v3, v4, v3 -; CHECK-NEXT: add.4s v2, v3, v2 -; CHECK-NEXT: fcmeq.4s v3, v1, v1 +; CHECK-NEXT: add.4s v3, v1, v3 +; CHECK-NEXT: and.16b v2, v4, v2 +; CHECK-NEXT: fcmeq.4s v4, v1, v1 ; CHECK-NEXT: orr.4s v1, #64, lsl #16 -; CHECK-NEXT: bit.16b v1, v2, v3 +; CHECK-NEXT: add.4s v2, v2, v3 +; CHECK-NEXT: bit.16b v1, v2, v4 ; CHECK-NEXT: mvni.4h v2, #128, lsl #8 ; CHECK-NEXT: shrn.4h v1, v1, #16 ; CHECK-NEXT: bif.8b v0, v1, v2 @@ -322,22 +322,22 @@ define <8 x bfloat> @test_copysign_v8bf16_v8bf16(<8 x bfloat> %a, <8 x bfloat> % define <8 x bfloat> @test_copysign_v8bf16_v8f32(<8 x bfloat> %a, <8 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v8bf16_v8f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.4s v3, #127, msl #8 -; CHECK-NEXT: movi.4s v4, #1 +; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: movi.4s v4, #127, msl #8 ; CHECK-NEXT: ushr.4s v5, v2, #16 ; CHECK-NEXT: ushr.4s v6, v1, #16 -; CHECK-NEXT: add.4s v7, v2, v3 -; CHECK-NEXT: add.4s v3, v1, v3 -; CHECK-NEXT: and.16b v5, v5, v4 -; CHECK-NEXT: and.16b v4, v6, v4 +; CHECK-NEXT: and.16b v5, v5, v3 +; CHECK-NEXT: add.4s v7, v2, v4 +; CHECK-NEXT: and.16b v3, v6, v3 +; CHECK-NEXT: add.4s v4, v1, v4 ; CHECK-NEXT: fcmeq.4s v6, v2, v2 ; CHECK-NEXT: orr.4s v2, #64, lsl #16 ; CHECK-NEXT: add.4s v5, v5, v7 -; CHECK-NEXT: add.4s v3, v4, v3 -; CHECK-NEXT: fcmeq.4s v4, v1, v1 +; CHECK-NEXT: fcmeq.4s v7, v1, v1 ; CHECK-NEXT: orr.4s v1, #64, lsl #16 +; CHECK-NEXT: add.4s v3, v3, v4 ; CHECK-NEXT: bit.16b v2, v5, v6 -; CHECK-NEXT: bit.16b v1, v3, v4 +; CHECK-NEXT: bit.16b v1, v3, v7 ; CHECK-NEXT: uzp2.8h v1, v1, v2 ; CHECK-NEXT: mvni.8h v2, #128, lsl #8 ; CHECK-NEXT: bif.16b v0, v1, v2 diff --git a/llvm/test/CodeGen/AArch64/vector-gep.ll b/llvm/test/CodeGen/AArch64/vector-gep.ll index 30317dce85e656..c7858416e1796e 100644 --- a/llvm/test/CodeGen/AArch64/vector-gep.ll +++ b/llvm/test/CodeGen/AArch64/vector-gep.ll @@ -13,11 +13,11 @@ define <2 x ptr> @vector_gep(<2 x ptr> %0) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: Lloh0: ; CHECK-NEXT: adrp x8, lCPI0_0@PAGE +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEXT: Lloh1: ; CHECK-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 entry: diff --git a/llvm/test/CodeGen/AArch64/vselect-constants.ll b/llvm/test/CodeGen/AArch64/vselect-constants.ll index a32147eebd7592..5e6ff1e0740ce3 100644 
--- a/llvm/test/CodeGen/AArch64/vselect-constants.ll +++ b/llvm/test/CodeGen/AArch64/vselect-constants.ll @@ -370,10 +370,9 @@ define <vscale x 16 x i8> @signbit_mask_xor_nxv16i8(<vscale x 16 x i8> %a, <vsca %cond = icmp slt <vscale x 16 x i8> %a, zeroinitializer %xor = xor <vscale x 16 x i8> %a, %b diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 08ad34c7b03ba0..599bd811d7d598 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1697,14 +1697,14 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-NEXT: ldp q18, q16, [x10, #96] ; CHECK-NEXT: uaddw.2d v2, v17, v2 ; CHECK-NEXT: stp q4, q5, [x10, #32] -; CHECK-NEXT: ldp q17, q5, [x10, #64] -; CHECK-NEXT: uaddw2.2d v16, v16, v7 +; CHECK-NEXT: uaddw2.2d v5, v16, v7 +; CHECK-NEXT: ldp q16, q4, [x10, #64] ; CHECK-NEXT: uaddw.2d v7, v18, v7 ; CHECK-NEXT: stp q2, q6, [x10] -; CHECK-NEXT: uaddw2.2d v4, v5, v3 -; CHECK-NEXT: uaddw.2d v3, v17, v3 -; CHECK-NEXT: stp q7, q16, [x10, #96] -; CHECK-NEXT: stp q3, q4, [x10, #64] +; CHECK-NEXT: uaddw2.2d v4, v4, v3 +; CHECK-NEXT: uaddw.2d v2, v16, v3 +; CHECK-NEXT: stp q7, q5, [x10, #96] +; CHECK-NEXT: stp q2, q4, [x10, #64] ; CHECK-NEXT: b.ne LBB17_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret @@ -1729,15 +1729,15 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: ld1 { v3.8b }, [x10] ; CHECK-BE-NEXT: add x10, x1, x8 ; CHECK-BE-NEXT: add x8, x8, #128 -; CHECK-BE-NEXT: add x15, x10, #96 ; CHECK-BE-NEXT: add x11, x10, #32 ; CHECK-BE-NEXT: add x14, x10, #64 +; CHECK-BE-NEXT: add x15, x10, #96 ; CHECK-BE-NEXT: tbl v4.16b, { v2.16b }, v1.16b ; CHECK-BE-NEXT: tbl v2.16b, { v2.16b }, v0.16b -; CHECK-BE-NEXT: ld1 { v16.2d }, [x15] -; CHECK-BE-NEXT: tbl v5.16b, { v3.16b }, v1.16b +; CHECK-BE-NEXT: ld1 { v5.2d }, [x10] +; CHECK-BE-NEXT: tbl v6.16b, { v3.16b }, v1.16b ; CHECK-BE-NEXT: tbl v3.16b, { v3.16b }, v0.16b -; CHECK-BE-NEXT: ld1 { v6.2d }, [x10] +; CHECK-BE-NEXT: ld1 { v16.2d }, [x15] ; CHECK-BE-NEXT: ld1 { v19.2d }, [x14] ; CHECK-BE-NEXT: ld1 { v21.2d }, [x11] ; CHECK-BE-NEXT: add x12, x10, #48 @@ -1747,11 +1747,12 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: rev32 v7.8b, v4.8b ; CHECK-BE-NEXT: ext v4.16b, v4.16b, v4.16b, #8 ; CHECK-BE-NEXT: rev32 v17.8b, v2.8b -; CHECK-BE-NEXT: ext v18.16b, v5.16b, v5.16b, #8 +; CHECK-BE-NEXT: ext v18.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: ext v20.16b, v3.16b, v3.16b, #8 ; CHECK-BE-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-BE-NEXT: rev32 v5.8b, v5.8b +; CHECK-BE-NEXT: rev32 v6.8b, v6.8b ; CHECK-BE-NEXT: rev32 v3.8b, v3.8b +; CHECK-BE-NEXT: ld1 { v22.2d }, [x12] ; CHECK-BE-NEXT: cmp x8, #1024 ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: uaddw v7.2d, v16.2d, v7.2s @@ -1760,22 +1761,21 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: rev32 v20.8b, v20.8b ; CHECK-BE-NEXT: rev32 v2.8b, v2.8b ; CHECK-BE-NEXT: uaddw v17.2d, v19.2d, v17.2s -; CHECK-BE-NEXT: ld1 { v19.2d }, [x12] -; CHECK-BE-NEXT: uaddw v5.2d, v21.2d, v5.2s -; CHECK-BE-NEXT: ld1 { v21.2d }, [x13] -; CHECK-BE-NEXT: uaddw v3.2d, v6.2d, v3.2s -; CHECK-BE-NEXT: ld1 { v6.2d }, [x17] -; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s +; CHECK-BE-NEXT: ld1 { v19.2d }, [x13] +; CHECK-BE-NEXT: uaddw v6.2d, v21.2d, v6.2s +; CHECK-BE-NEXT: uaddw v3.2d, v5.2d, v3.2s +; CHECK-BE-NEXT: ld1 { v5.2d }, [x17] ; CHECK-BE-NEXT: st1 { v7.2d }, [x15] -; CHECK-BE-NEXT: uaddw v7.2d, v19.2d, v18.2s -; CHECK-BE-NEXT: uaddw 
v16.2d, v21.2d, v20.2s -; CHECK-BE-NEXT: uaddw v2.2d, v6.2d, v2.2s -; CHECK-BE-NEXT: st1 { v17.2d }, [x14] -; CHECK-BE-NEXT: st1 { v5.2d }, [x11] +; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s +; CHECK-BE-NEXT: st1 { v6.2d }, [x11] +; CHECK-BE-NEXT: uaddw v6.2d, v22.2d, v18.2s ; CHECK-BE-NEXT: st1 { v3.2d }, [x10] +; CHECK-BE-NEXT: uaddw v3.2d, v19.2d, v20.2s +; CHECK-BE-NEXT: uaddw v2.2d, v5.2d, v2.2s +; CHECK-BE-NEXT: st1 { v17.2d }, [x14] ; CHECK-BE-NEXT: st1 { v4.2d }, [x16] -; CHECK-BE-NEXT: st1 { v7.2d }, [x12] -; CHECK-BE-NEXT: st1 { v16.2d }, [x13] +; CHECK-BE-NEXT: st1 { v6.2d }, [x12] +; CHECK-BE-NEXT: st1 { v3.2d }, [x13] ; CHECK-BE-NEXT: st1 { v2.2d }, [x17] ; CHECK-BE-NEXT: b.ne .LBB17_1 ; CHECK-BE-NEXT: // %bb.2: // %exit diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s index 73eadc268bf26a..f703392f3e9d05 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s @@ -1070,14 +1070,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 4 0.50 abs d29, d24 -# CHECK-NEXT: 1 4 0.50 abs v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 abs v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 abs v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 abs v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 abs v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 abs v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 abs v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 abs d29, d24 +# CHECK-NEXT: 1 3 0.50 abs v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 abs v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 abs v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 abs v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 abs v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 abs v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 abs v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 add d17, d31, d29 # CHECK-NEXT: 1 3 0.50 add v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 addhn v0.2s, v0.2d, v0.2d @@ -1086,8 +1086,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 addhn2 v0.16b, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 addhn2 v0.4s, v0.2d, v0.2d # CHECK-NEXT: 1 4 0.50 addhn2 v0.8h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 addp v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 addp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 addp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 addp v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 3 0.50 and v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 3 0.50 bic v0.4h, #15, lsl #8 # CHECK-NEXT: 1 3 0.50 bic v0.8b, v0.8b, v0.8b @@ -1441,13 +1441,13 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 mvni v0.2s, #0 # CHECK-NEXT: 1 3 0.50 mvni v0.4s, #16, msl #16 # CHECK-NEXT: 1 3 0.50 neg d29, d24 -# CHECK-NEXT: 1 4 0.50 neg v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 neg v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 neg v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 neg v0.2d, v0.2d # CHECK-NEXT: 1 3 0.50 neg v0.2s, v0.2s # CHECK-NEXT: 1 3 0.50 neg v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 neg v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 neg v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 neg v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 neg v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 neg v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 mvn v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 mvn v0.8b, v0.8b # CHECK-NEXT: 1 3 0.50 orn v0.16b, v0.16b, v0.16b @@ -1457,12 +1457,12 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 pmul v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 pmull v0.8h, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 pmull2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 raddhn v0.2s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 
0.50 raddhn v0.4h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 raddhn v0.8b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 raddhn2 v0.16b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 raddhn2 v0.4s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 raddhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 8 0.50 raddhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 raddhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 8 0.50 raddhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 raddhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 raddhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 raddhn2 v0.8h, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 rbit v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 rbit v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 rev16 v21.8b, v1.8b @@ -1483,19 +1483,19 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 rshrn2 v0.16b, v0.8h, #3 # CHECK-NEXT: 1 4 0.50 rshrn2 v0.4s, v0.2d, #3 # CHECK-NEXT: 1 4 0.50 rshrn2 v0.8h, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 rsubhn v0.2s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 rsubhn v0.4h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 rsubhn v0.8b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 rsubhn2 v0.16b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 rsubhn2 v0.4s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 rsubhn2 v0.8h, v0.4s, v0.4s -# CHECK-NEXT: 1 8 0.50 saba v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 8 0.50 sabal v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 8 0.50 sabal v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 8 0.50 sabal v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 8 0.50 sabal2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 8 0.50 sabal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 8 0.50 sabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 8 0.50 rsubhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 rsubhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 8 0.50 rsubhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 rsubhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 rsubhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 rsubhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 6 0.50 saba v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 6 0.50 sabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 6 0.50 sabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 6 0.50 sabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 6 0.50 sabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 6 0.50 sabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 6 0.50 sabal2 v0.8h, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 sabd v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 3 0.50 sabdl v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 3 0.50 sabdl v0.4s, v0.4h, v0.4h @@ -1503,30 +1503,30 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 sabdl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 sabdl2 v0.4s, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 sabdl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 8 1.00 sadalp v0.1d, v0.2s -# CHECK-NEXT: 1 8 1.00 sadalp v0.2d, v0.4s -# CHECK-NEXT: 1 8 1.00 sadalp v0.2s, v0.4h -# CHECK-NEXT: 1 8 1.00 sadalp v0.4h, v0.8b -# CHECK-NEXT: 1 8 1.00 sadalp v0.4s, v0.8h -# CHECK-NEXT: 1 8 1.00 sadalp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 saddl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 saddl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 saddl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 saddl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 saddl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 saddl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 saddlp v0.1d, v0.2s -# CHECK-NEXT: 1 4 0.50 saddlp v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 saddlp v0.2s, v0.4h -# CHECK-NEXT: 1 4 0.50 saddlp v0.4h, v0.8b -# CHECK-NEXT: 1 4 0.50 saddlp v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 saddlp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 saddw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 saddw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 saddw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 
0.50 saddw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 saddw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 saddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 7 1.00 sadalp v0.1d, v0.2s +# CHECK-NEXT: 1 7 1.00 sadalp v0.2d, v0.4s +# CHECK-NEXT: 1 7 1.00 sadalp v0.2s, v0.4h +# CHECK-NEXT: 1 7 1.00 sadalp v0.4h, v0.8b +# CHECK-NEXT: 1 7 1.00 sadalp v0.4s, v0.8h +# CHECK-NEXT: 1 7 1.00 sadalp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 saddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 saddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 saddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 saddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 saddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 saddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 saddlp v0.1d, v0.2s +# CHECK-NEXT: 1 3 0.50 saddlp v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 saddlp v0.2s, v0.4h +# CHECK-NEXT: 1 3 0.50 saddlp v0.4h, v0.8b +# CHECK-NEXT: 1 3 0.50 saddlp v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 saddlp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 saddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 saddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 saddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 saddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 saddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 saddw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 0.50 scvtf d21, d12 # CHECK-NEXT: 1 4 0.50 scvtf d21, d12, #64 # CHECK-NEXT: 1 4 0.50 scvtf s22, s13 @@ -1573,18 +1573,18 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 sli v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 sli v0.8b, v0.8b, #3 # CHECK-NEXT: 1 4 0.50 sli v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 smax v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 smax v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 smax v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 smaxp v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 smaxp v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 smaxp v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 smin v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 smax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smax v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smax v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smaxp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smaxp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smin v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 smin v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 smin v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 sminp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 smin v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sminp v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 sminp v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 sminp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sminp v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 smlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 smlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 smlal v0.8h, v0.8b, v0.8b @@ -1777,14 +1777,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 srshr v0.4s, v0.4s, #3 # CHECK-NEXT: 1 3 0.50 srshr v0.8b, v0.8b, #3 # CHECK-NEXT: 1 3 0.50 srshr v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 8 1.00 srsra d15, d11, #19 -# CHECK-NEXT: 1 8 1.00 srsra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 7 1.00 srsra d15, d11, #19 +# CHECK-NEXT: 1 7 1.00 srsra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 7 1.00 srsra 
v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.8h, v0.8h, #3 # CHECK-NEXT: 1 3 0.50 sshl d31, d31, d31 # CHECK-NEXT: 1 3 0.50 sshl v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1 3 0.50 sshl v0.2s, v0.2s, v0.2s @@ -1800,26 +1800,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 sshr v0.4s, v0.4s, #3 # CHECK-NEXT: 1 3 0.50 sshr v0.8b, v0.8b, #3 # CHECK-NEXT: 1 3 0.50 sshr v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 8 1.00 ssra d18, d12, #21 -# CHECK-NEXT: 1 8 1.00 ssra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 ssubl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 ssubl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 ssubl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 ssubl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 ssubl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 ssubl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 ssubw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 ssubw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 ssubw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 0.50 ssubw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 ssubw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 ssubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 ssra d18, d12, #21 +# CHECK-NEXT: 1 3 0.50 ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 ssubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 ssubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 ssubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 ssubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 ssubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 ssubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 1.00 * st1 { v0.16b }, [x0] # CHECK-NEXT: 2 5 2.00 * st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 # CHECK-NEXT: 1 5 4.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] @@ -1843,7 +1843,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 5 2.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] # CHECK-NEXT: 2 5 2.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 # CHECK-NEXT: 1 3 0.50 sub d15, d5, d16 -# CHECK-NEXT: 1 4 0.50 sub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 sub v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1 4 0.50 suqadd b19, b14 # CHECK-NEXT: 1 4 0.50 suqadd d18, d22 # CHECK-NEXT: 1 4 0.50 suqadd h20, h15 @@ -1885,13 +1885,13 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 trn2 v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 trn2 v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 trn2 v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 8 0.50 uaba v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 8 0.50 uabal v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 8 0.50 uabal v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 8 0.50 uabal v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 8 0.50 uabal2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 8 
0.50 uabal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 8 0.50 uabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 6 0.50 uaba v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 6 0.50 uabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 6 0.50 uabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 6 0.50 uabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 6 0.50 uabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 6 0.50 uabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 6 0.50 uabal2 v0.8h, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 uabd v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 3 0.50 uabdl v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 3 0.50 uabdl v0.4s, v0.4h, v0.4h @@ -1899,30 +1899,30 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 uabdl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 uabdl2 v0.4s, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 uabdl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 8 1.00 uadalp v0.1d, v0.2s -# CHECK-NEXT: 1 8 1.00 uadalp v0.2d, v0.4s -# CHECK-NEXT: 1 8 1.00 uadalp v0.2s, v0.4h -# CHECK-NEXT: 1 8 1.00 uadalp v0.4h, v0.8b -# CHECK-NEXT: 1 8 1.00 uadalp v0.4s, v0.8h -# CHECK-NEXT: 1 8 1.00 uadalp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 uaddl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 uaddl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 uaddl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 uaddl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 uaddl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 uaddl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 uaddlp v0.1d, v0.2s -# CHECK-NEXT: 1 4 0.50 uaddlp v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 uaddlp v0.2s, v0.4h -# CHECK-NEXT: 1 4 0.50 uaddlp v0.4h, v0.8b -# CHECK-NEXT: 1 4 0.50 uaddlp v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 uaddlp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 uaddw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 uaddw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 uaddw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 0.50 uaddw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 uaddw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 uaddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 7 1.00 uadalp v0.1d, v0.2s +# CHECK-NEXT: 1 7 1.00 uadalp v0.2d, v0.4s +# CHECK-NEXT: 1 7 1.00 uadalp v0.2s, v0.4h +# CHECK-NEXT: 1 7 1.00 uadalp v0.4h, v0.8b +# CHECK-NEXT: 1 7 1.00 uadalp v0.4s, v0.8h +# CHECK-NEXT: 1 7 1.00 uadalp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddlp v0.1d, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddlp v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddlp v0.2s, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddlp v0.4h, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddlp v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddlp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 0.50 ucvtf d21, d14 # CHECK-NEXT: 1 4 0.50 ucvtf d21, d14, #64 # CHECK-NEXT: 1 4 0.50 ucvtf s22, s13 @@ -1935,21 +1935,21 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 ucvtf v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 uhadd v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 uhadd v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 uhsub v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 umax v0.16b, 
v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uhsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umax v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 umax v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 umax v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 umaxp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 umax v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umaxp v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 umaxp v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 umaxp v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 umin v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 umin v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 umin v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 uminp v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 uminp v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 uminp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umaxp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 umin v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 umin v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 uminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 uminp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uminp v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 umlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 umlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 umlal v0.8h, v0.8b, v0.8b @@ -2024,9 +2024,9 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 uqxtn2 v0.8h, v0.4s # CHECK-NEXT: 1 4 0.50 urecpe v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 urecpe v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 urhadd v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 urhadd v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 urhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 urhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 urhadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 urhadd v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 urshl d8, d7, d4 # CHECK-NEXT: 1 3 0.50 urshl v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 urshl v0.2d, v0.2d, v0.2d @@ -2042,14 +2042,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 urshr v0.8h, v0.8h, #3 # CHECK-NEXT: 1 12 9.00 ursqrte v0.2s, v0.2s # CHECK-NEXT: 1 12 9.00 ursqrte v0.4s, v0.4s -# CHECK-NEXT: 1 8 1.00 ursra d18, d10, #13 -# CHECK-NEXT: 1 8 1.00 ursra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 7 1.00 ursra d18, d10, #13 +# CHECK-NEXT: 1 7 1.00 ursra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.8h, v0.8h, #3 # CHECK-NEXT: 1 3 0.50 ushl d0, d0, d0 # CHECK-NEXT: 1 3 0.50 ushl v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 ushl v0.4s, v0.4s, v0.4s @@ -2075,26 +2075,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 usqadd v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 usqadd v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 usqadd v0.8h, v0.8h -# CHECK-NEXT: 1 8 1.00 usra d20, d13, #61 -# CHECK-NEXT: 1 8 1.00 usra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.8b, 
v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 usubl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 usubl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 usubl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 usubl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 usubl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 usubl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 usubw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 usubw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 usubw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 0.50 usubw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 usubw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 usubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 usra d20, d13, #61 +# CHECK-NEXT: 1 3 0.50 usra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 usubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 usubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 usubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 usubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 usubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 usubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 usubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 usubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 usubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 usubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 usubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 usubw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 0.50 uzp1 v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 uzp1 v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1 4 0.50 uzp1 v0.2s, v0.2s, v0.2s @@ -2148,7 +2148,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11] -# CHECK-NEXT: - - - - - 39.00 91.00 - - 509.00 509.00 3.00 3.00 197.00 +# CHECK-NEXT: - - - - - 39.00 91.00 - - 501.00 501.00 3.00 3.00 197.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11] Instructions: @@ -2882,14 +2882,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - sshr v0.4s, v0.4s, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - sshr v0.8b, v0.8b, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - sshr v0.8h, v0.8h, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra d18, d12, #21 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.16b, v0.16b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.2d, v0.2d, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.2s, v0.2s, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.4h, v0.4h, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.4s, v0.4s, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.8b, v0.8b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra d18, d12, #21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.8b, 
v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.8h, v0.8h, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssubl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssubl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssubl v0.8h, v0.8b, v0.8b @@ -3157,14 +3157,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usqadd v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usqadd v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usqadd v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra d20, d13, #61 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.16b, v0.16b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.2d, v0.2d, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.2s, v0.2s, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.4h, v0.4h, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.4s, v0.4s, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.8b, v0.8b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra d20, d13, #61 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.8h, v0.8h, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usubl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usubl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usubl v0.8h, v0.8b, v0.8b diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s index a8fb8b669838fa..d8051e7ecb4fe8 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s @@ -3476,12 +3476,12 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 add z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 add z31.s, z31.s, #65280 # CHECK-NEXT: 1 3 0.50 add z31.s, z31.s, z31.s -# CHECK-NEXT: 1 4 0.50 addhnb z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 addhnb z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 addhnb z0.s, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 addhnt z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 addhnt z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 addhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 addhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 addhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 addhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 addhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 addhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 addhnt z0.s, z1.d, z31.d # CHECK-NEXT: 1 3 0.50 addp z0.b, p0/m, z0.b, z1.b # CHECK-NEXT: 1 3 0.50 addp z0.h, p0/m, z0.h, z1.h # CHECK-NEXT: 1 3 0.50 addp z29.s, p7/m, z29.s, z30.s @@ -3516,7 +3516,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 aesimc z31.b, z31.b # CHECK-NEXT: 1 3 0.50 aesmc z0.b, z0.b # CHECK-NEXT: 1 3 0.50 aesmc z31.b, z31.b -# CHECK-NEXT: 1 6 1.00 and p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 and p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, #0x6 # CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, 
#0xfffffffffffffff9 # CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, z0.d @@ -3531,7 +3531,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 and z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 and z5.b, z5.b, #0x6 # CHECK-NEXT: 1 3 0.50 and z5.b, z5.b, #0xf9 -# CHECK-NEXT: 1 6 1.00 ands p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 ands p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 4 1.00 andv b0, p7, z31.b # CHECK-NEXT: 1 4 1.00 andv d0, p7, z31.d # CHECK-NEXT: 1 4 1.00 andv h0, p7, z31.h @@ -3574,7 +3574,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 asrr z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: 1 3 0.50 asrr z0.h, p0/m, z0.h, z0.h # CHECK-NEXT: 1 3 0.50 asrr z0.s, p0/m, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 bcax z29.d, z29.d, z30.d, z31.d +# CHECK-NEXT: 1 4 0.50 bcax z29.d, z29.d, z30.d, z31.d # CHECK-NEXT: 1 14 13.00 bdep z0.b, z1.b, z31.b # CHECK-NEXT: 1 70 69.00 bdep z0.d, z1.d, z31.d # CHECK-NEXT: 1 22 21.00 bdep z0.h, z1.h, z31.h @@ -3603,34 +3603,34 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 70 69.00 bgrp z0.d, z1.d, z31.d # CHECK-NEXT: 1 22 21.00 bgrp z0.h, z1.h, z31.h # CHECK-NEXT: 1 38 37.00 bgrp z0.s, z1.s, z31.s -# CHECK-NEXT: 1 6 1.00 bic p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 bic p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 bic p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 bic p15.b, p15/z, p15.b, p15.b # CHECK-NEXT: 1 3 0.50 bic z0.d, z0.d, z0.d # CHECK-NEXT: 1 3 0.50 bic z23.d, z13.d, z8.d # CHECK-NEXT: 1 3 0.50 bic z31.b, p7/m, z31.b, z31.b # CHECK-NEXT: 1 3 0.50 bic z31.d, p7/m, z31.d, z31.d # CHECK-NEXT: 1 3 0.50 bic z31.h, p7/m, z31.h, z31.h # CHECK-NEXT: 1 3 0.50 bic z31.s, p7/m, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 bics p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 bics p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brka p0.b, p15/m, p15.b -# CHECK-NEXT: 1 6 1.00 brka p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkas p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkb p0.b, p15/m, p15.b -# CHECK-NEXT: 1 6 1.00 brkb p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkbs p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkn p0.b, p15/z, p1.b, p0.b -# CHECK-NEXT: 1 6 1.00 brkn p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkns p0.b, p15/z, p1.b, p0.b -# CHECK-NEXT: 1 6 1.00 brkns p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpa p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpa p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpas p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpas p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpb p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpb p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpbs p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpbs p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 bics p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 bics p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brka p0.b, p15/m, p15.b +# CHECK-NEXT: 1 2 1.00 brka p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkas p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkb p0.b, p15/m, p15.b +# CHECK-NEXT: 1 2 1.00 brkb p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkbs p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkn p0.b, p15/z, p1.b, p0.b +# CHECK-NEXT: 1 2 1.00 brkn p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkns p0.b, p15/z, p1.b, p0.b +# CHECK-NEXT: 1 2 1.00 brkns p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkpa p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 2 1.00 brkpa p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 4 1.00 brkpas p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 4 1.00 brkpas p15.b, p15/z, p15.b, p15.b 
+# CHECK-NEXT: 1 2 1.00 brkpb p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 2 1.00 brkpb p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 4 1.00 brkpbs p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 4 1.00 brkpbs p15.b, p15/z, p15.b, p15.b # CHECK-NEXT: 1 3 0.50 bsl z0.d, z0.d, z1.d, z2.d # CHECK-NEXT: 1 3 0.50 bsl1n z0.d, z0.d, z1.d, z2.d # CHECK-NEXT: 1 3 0.50 bsl2n z0.d, z0.d, z1.d, z2.d @@ -3704,163 +3704,163 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 cmla z31.h, z31.h, z31.h, #180 # CHECK-NEXT: 1 4 0.50 cmla z31.s, z30.s, z7.s[0], #180 # CHECK-NEXT: 1 4 0.50 cmla z31.s, z31.s, z31.s, #180 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, z0.b -# 
CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmple p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmple p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmple p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmplo p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmplo p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmplo p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmplo p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmplo p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmplo p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmplo p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpls p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpls p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpls p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.s, 
p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmplt p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmplt p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmplt p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z0.d, #15 +# 
CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmple p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmple p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmple p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmple p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmple p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmple p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmple p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmplo p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmplo p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmplo p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmplo p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmplo 
p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmplo p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmplo p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpls p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpls p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpls p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmplt p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmplt p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmplt p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, z0.s # CHECK-NEXT: 1 3 0.50 cnot z31.b, p7/m, z31.b # CHECK-NEXT: 1 3 0.50 cnot z31.d, p7/m, z31.d # CHECK-NEXT: 1 3 0.50 cnot z31.h, p7/m, z31.h # CHECK-NEXT: 1 3 0.50 cnot z31.s, p7/m, z31.s -# CHECK-NEXT: 1 4 0.50 cnt z31.b, p7/m, z31.b +# CHECK-NEXT: 1 3 0.50 cnt z31.b, p7/m, z31.b # CHECK-NEXT: 1 12 0.50 cnt z31.d, p7/m, z31.d -# CHECK-NEXT: 1 4 0.50 cnt z31.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 cnt z31.h, p7/m, z31.h # CHECK-NEXT: 1 8 0.50 cnt z31.s, p7/m, z31.s # CHECK-NEXT: 1 1 0.33 cntb x0 # CHECK-NEXT: 1 1 0.33 cntb x0, #28 @@ -3874,10 +3874,10 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 1 0.33 cnth x0, #28 # CHECK-NEXT: 1 1 0.33 cnth x0, all, mul #16 # CHECK-NEXT: 1 1 0.33 cnth x0, pow2 -# CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.b -# CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.d -# CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.h -# CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.s +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.b +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.d +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.h +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.s # CHECK-NEXT: 1 1 0.33 cntw x0 # CHECK-NEXT: 1 1 0.33 cntw x0, #28 # CHECK-NEXT: 1 1 0.33 cntw x0, all, mul #16 @@ -3892,42 +3892,42 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 1 0.33 ctermne wzr, w30 # CHECK-NEXT: 1 1 0.33 ctermne x30, xzr # CHECK-NEXT: 1 1 0.33 ctermne xzr, x30 -# CHECK-NEXT: 1 1 0.33 decb x0 -# CHECK-NEXT: 1 
1 0.33 decb x0, #14 -# CHECK-NEXT: 1 1 0.33 decb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 decb x0, pow2 -# CHECK-NEXT: 1 1 0.33 decb x0, vl1 -# CHECK-NEXT: 1 1 0.33 decd x0 -# CHECK-NEXT: 1 1 0.33 decd x0, #14 -# CHECK-NEXT: 1 1 0.33 decd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 decd x0, pow2 -# CHECK-NEXT: 1 1 0.33 decd x0, vl1 -# CHECK-NEXT: 1 1 0.33 dech x0 -# CHECK-NEXT: 1 1 0.33 dech x0, #14 -# CHECK-NEXT: 1 1 0.33 dech x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 dech x0, pow2 -# CHECK-NEXT: 1 1 0.33 dech x0, vl1 -# CHECK-NEXT: 1 6 1.00 decp x0, p0.b -# CHECK-NEXT: 1 6 1.00 decp x0, p0.d -# CHECK-NEXT: 1 6 1.00 decp x0, p0.h -# CHECK-NEXT: 1 6 1.00 decp x0, p0.s -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.b -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.d -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.h -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.s +# CHECK-NEXT: 1 3 0.33 decb x0 +# CHECK-NEXT: 1 3 0.33 decb x0, #14 +# CHECK-NEXT: 1 3 0.33 decb x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 decb x0, pow2 +# CHECK-NEXT: 1 3 0.33 decb x0, vl1 +# CHECK-NEXT: 1 3 0.33 decd x0 +# CHECK-NEXT: 1 3 0.33 decd x0, #14 +# CHECK-NEXT: 1 3 0.33 decd x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 decd x0, pow2 +# CHECK-NEXT: 1 3 0.33 decd x0, vl1 +# CHECK-NEXT: 1 3 0.33 dech x0 +# CHECK-NEXT: 1 3 0.33 dech x0, #14 +# CHECK-NEXT: 1 3 0.33 dech x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 dech x0, pow2 +# CHECK-NEXT: 1 3 0.33 dech x0, vl1 +# CHECK-NEXT: 1 4 1.00 decp x0, p0.b +# CHECK-NEXT: 1 4 1.00 decp x0, p0.d +# CHECK-NEXT: 1 4 1.00 decp x0, p0.h +# CHECK-NEXT: 1 4 1.00 decp x0, p0.s +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.b +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.d +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.h +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.s # CHECK-NEXT: 1 4 0.50 decp z31.d, p15.d # CHECK-NEXT: 1 4 0.50 decp z31.h, p15.h # CHECK-NEXT: 1 4 0.50 decp z31.s, p15.s -# CHECK-NEXT: 1 1 0.33 decw x0 -# CHECK-NEXT: 1 1 0.33 decw x0, #14 -# CHECK-NEXT: 1 1 0.33 decw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 decw x0, pow2 -# CHECK-NEXT: 1 1 0.33 decw x0, vl1 +# CHECK-NEXT: 1 3 0.33 decw x0 +# CHECK-NEXT: 1 3 0.33 decw x0, #14 +# CHECK-NEXT: 1 3 0.33 decw x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 decw x0, pow2 +# CHECK-NEXT: 1 3 0.33 decw x0, vl1 # CHECK-NEXT: 1 4 0.50 dupm z0.d, #0xfffffffffffffff9 # CHECK-NEXT: 1 4 0.50 dupm z0.s, #0xfffffff9 # CHECK-NEXT: 1 4 0.50 dupm z23.h, #0xfff9 # CHECK-NEXT: 1 4 0.50 dupm z5.b, #0xf9 -# CHECK-NEXT: 1 6 1.00 eor p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 eor p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, #0x6 # CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, #0xfffffffffffffff9 # CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, z0.d @@ -3942,12 +3942,12 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 eor z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 eor z5.b, z5.b, #0x6 # CHECK-NEXT: 1 3 0.50 eor z5.b, z5.b, #0xf9 -# CHECK-NEXT: 1 3 0.50 eor3 z29.d, z29.d, z30.d, z31.d +# CHECK-NEXT: 1 4 0.50 eor3 z29.d, z29.d, z30.d, z31.d # CHECK-NEXT: 1 4 0.50 eorbt z0.b, z1.b, z31.b # CHECK-NEXT: 1 4 0.50 eorbt z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 0.50 eorbt z0.h, z1.h, z31.h # CHECK-NEXT: 1 4 0.50 eorbt z0.s, z1.s, z31.s -# CHECK-NEXT: 1 6 1.00 eors p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 eors p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 4 0.50 eortb z0.b, z1.b, z31.b # CHECK-NEXT: 1 4 0.50 eortb z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 0.50 eortb z0.h, z1.h, z31.h @@ -4303,49 +4303,49 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 ftsmul z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 0.50 ftsmul z0.h, z1.h, z31.h # CHECK-NEXT: 1 4 0.50 
ftsmul z0.s, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 ftssel z0.d, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 ftssel z0.h, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 ftssel z0.s, z1.s, z31.s +# CHECK-NEXT: 1 3 0.50 ftssel z0.d, z1.d, z31.d +# CHECK-NEXT: 1 3 0.50 ftssel z0.h, z1.h, z31.h +# CHECK-NEXT: 1 3 0.50 ftssel z0.s, z1.s, z31.s # CHECK-NEXT: 1 8 2.00 histcnt z0.s, p0/z, z1.s, z2.s # CHECK-NEXT: 1 8 2.00 histcnt z29.d, p7/z, z30.d, z31.d # CHECK-NEXT: 1 8 2.00 histseg z0.b, z1.b, z31.b -# CHECK-NEXT: 1 1 0.33 incb x0 -# CHECK-NEXT: 1 1 0.33 incb x0, #14 -# CHECK-NEXT: 1 1 0.33 incb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 incb x0, pow2 -# CHECK-NEXT: 1 1 0.33 incb x0, vl1 -# CHECK-NEXT: 1 1 0.33 incd x0 -# CHECK-NEXT: 1 1 0.33 incd x0, #14 -# CHECK-NEXT: 1 1 0.33 incd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 incd x0, pow2 -# CHECK-NEXT: 1 1 0.33 incd x0, vl1 -# CHECK-NEXT: 1 4 0.50 incd z0.d -# CHECK-NEXT: 1 4 0.50 incd z0.d, all, mul #16 -# CHECK-NEXT: 1 1 0.33 inch x0 -# CHECK-NEXT: 1 1 0.33 inch x0, #14 -# CHECK-NEXT: 1 1 0.33 inch x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 inch x0, pow2 -# CHECK-NEXT: 1 1 0.33 inch x0, vl1 -# CHECK-NEXT: 1 4 0.50 inch z0.h -# CHECK-NEXT: 1 4 0.50 inch z0.h, all, mul #16 -# CHECK-NEXT: 1 6 1.00 incp x0, p0.b -# CHECK-NEXT: 1 6 1.00 incp x0, p0.d -# CHECK-NEXT: 1 6 1.00 incp x0, p0.h -# CHECK-NEXT: 1 6 1.00 incp x0, p0.s -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.b -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.d -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.h -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.s +# CHECK-NEXT: 1 3 0.33 incb x0 +# CHECK-NEXT: 1 3 0.33 incb x0, #14 +# CHECK-NEXT: 1 3 0.33 incb x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 incb x0, pow2 +# CHECK-NEXT: 1 3 0.33 incb x0, vl1 +# CHECK-NEXT: 1 3 0.33 incd x0 +# CHECK-NEXT: 1 3 0.33 incd x0, #14 +# CHECK-NEXT: 1 3 0.33 incd x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 incd x0, pow2 +# CHECK-NEXT: 1 3 0.33 incd x0, vl1 +# CHECK-NEXT: 1 3 0.50 incd z0.d +# CHECK-NEXT: 1 3 0.50 incd z0.d, all, mul #16 +# CHECK-NEXT: 1 3 0.33 inch x0 +# CHECK-NEXT: 1 3 0.33 inch x0, #14 +# CHECK-NEXT: 1 3 0.33 inch x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 inch x0, pow2 +# CHECK-NEXT: 1 3 0.33 inch x0, vl1 +# CHECK-NEXT: 1 3 0.50 inch z0.h +# CHECK-NEXT: 1 3 0.50 inch z0.h, all, mul #16 +# CHECK-NEXT: 1 4 1.00 incp x0, p0.b +# CHECK-NEXT: 1 4 1.00 incp x0, p0.d +# CHECK-NEXT: 1 4 1.00 incp x0, p0.h +# CHECK-NEXT: 1 4 1.00 incp x0, p0.s +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.b +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.d +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.h +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.s # CHECK-NEXT: 1 4 0.50 incp z31.d, p15.d # CHECK-NEXT: 1 4 0.50 incp z31.h, p15.h # CHECK-NEXT: 1 4 0.50 incp z31.s, p15.s -# CHECK-NEXT: 1 1 0.33 incw x0 -# CHECK-NEXT: 1 1 0.33 incw x0, #14 -# CHECK-NEXT: 1 1 0.33 incw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 incw x0, pow2 -# CHECK-NEXT: 1 1 0.33 incw x0, vl1 -# CHECK-NEXT: 1 4 0.50 incw z0.s -# CHECK-NEXT: 1 4 0.50 incw z0.s, all, mul #16 +# CHECK-NEXT: 1 3 0.33 incw x0 +# CHECK-NEXT: 1 3 0.33 incw x0, #14 +# CHECK-NEXT: 1 3 0.33 incw x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 incw x0, pow2 +# CHECK-NEXT: 1 3 0.33 incw x0, vl1 +# CHECK-NEXT: 1 3 0.50 incw z0.s +# CHECK-NEXT: 1 3 0.50 incw z0.s, all, mul #16 # CHECK-NEXT: 1 4 0.50 index z0.b, #0, #0 # CHECK-NEXT: 1 4 0.50 index z0.d, #0, #0 # CHECK-NEXT: 1 4 0.50 index z0.h, #0, #0 @@ -4412,8 +4412,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 * ld1b { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 7 7.00 * ld1b { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * ld1b { z0.h }, 
p0/z, [x0] -# CHECK-NEXT: 1 9 4.50 * ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 3 0.50 * ld1b { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 9 9.00 * ld1b { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 3 0.50 * ld1b { z21.b }, p5/z, [x10, #5, mul vl] @@ -4450,8 +4450,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 * ld1h { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 7 7.00 * ld1h { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * ld1h { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 9 4.50 * ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 3 0.50 * ld1h { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 9 9.00 * ld1h { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 3 0.50 * ld1h { z21.d }, p5/z, [x10, #5, mul vl] @@ -4467,8 +4467,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * ld1h { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: 1 3 0.50 * ld1h { z31.h }, p7/z, [sp, #-1, mul vl] # CHECK-NEXT: 1 3 0.50 * ld1h { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: 1 9 4.50 * ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: 1 9 4.50 * ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 7 3.50 * ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 3.50 * ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: 1 9 9.00 * ld1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 3 0.50 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1] # CHECK-NEXT: 1 3 0.50 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1] @@ -4529,7 +4529,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 * ld1sb { z0.h }, p0/z, [sp, x0] # CHECK-NEXT: 1 3 0.50 * ld1sb { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: 1 3 0.50 * ld1sb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 9 4.50 * ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw] # CHECK-NEXT: 1 3 0.50 * ld1sb { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 9 9.00 * ld1sb { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 3 0.50 * ld1sb { z21.d }, p5/z, [x10, #5, mul vl] @@ -4549,8 +4549,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] # CHECK-NEXT: 1 3 0.50 * ld1sh { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 7 7.00 * ld1sh { z0.d }, p0/z, [z0.d] -# CHECK-NEXT: 1 9 4.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 3 0.50 * ld1sh { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 9 9.00 * ld1sh { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 3 0.50 * ld1sh { z21.d }, p5/z, [x10, #5, mul vl] @@ -4565,8 +4565,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * ld1sh { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: 1 7 7.00 * ld1sh { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: 1 3 0.50 * ld1sh { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: 1 9 4.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: 1 9 4.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 7 3.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 3.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: 1 9 9.00 * ld1sh { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 7 7.00 * ld1sw { z0.d }, p0/z, 
[x0, z0.d, sxtw #2] # CHECK-NEXT: 1 7 7.00 * ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2] @@ -4585,8 +4585,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] # CHECK-NEXT: 1 3 0.50 * ld1w { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 7 7.00 * ld1w { z0.d }, p0/z, [z0.d] -# CHECK-NEXT: 1 9 4.50 * ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 3 0.50 * ld1w { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 9 9.00 * ld1w { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 3 0.50 * ld1w { z21.d }, p5/z, [x10, #5, mul vl] @@ -4601,8 +4601,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * ld1w { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: 1 7 7.00 * ld1w { z31.d }, p7/z, [z31.d, #124] # CHECK-NEXT: 1 3 0.50 * ld1w { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: 1 9 4.50 * ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] -# CHECK-NEXT: 1 9 4.50 * ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: 1 7 3.50 * ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: 1 7 3.50 * ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] # CHECK-NEXT: 1 9 9.00 * ld1w { z31.s }, p7/z, [z31.s, #124] # CHECK-NEXT: 1 3 2.00 * ld2b { z0.b, z1.b }, p0/z, [x0, x0] # CHECK-NEXT: 1 3 1.00 * ld2b { z0.b, z1.b }, p0/z, [x0] @@ -4668,8 +4668,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1b { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * U ldff1b { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: 1 3 0.50 * U ldff1b { z0.s }, p0/z, [x0, x0] -# CHECK-NEXT: 1 9 4.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 9 9.00 * U ldff1b { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 7 7.00 * U ldff1b { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: 1 7 7.00 * U ldff1b { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -4696,8 +4696,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1h { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * U ldff1h { z0.h }, p0/z, [x0, x0, lsl #1] # CHECK-NEXT: 1 3 0.50 * U ldff1h { z0.s }, p0/z, [x0, x0, lsl #1] -# CHECK-NEXT: 1 9 4.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 9 9.00 * U ldff1h { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 7 7.00 * U ldff1h { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: 1 7 7.00 * U ldff1h { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -4706,16 +4706,16 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 * U ldff1h { z31.d }, p7/z, [sp] # CHECK-NEXT: 1 7 7.00 * U ldff1h { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: 1 3 0.50 * U ldff1h { z31.h }, p7/z, [sp] -# CHECK-NEXT: 1 9 4.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: 1 9 4.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 7 3.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 3.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: 1 3 0.50 * U ldff1h { z31.s }, p7/z, [sp] # CHECK-NEXT: 1 9 9.00 * U ldff1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 3 0.50 * U ldff1sb { z0.d }, p0/z, [x0, x0] # CHECK-NEXT: 1 7 7.00 * U ldff1sb { z0.d 
}, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * U ldff1sb { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: 1 3 0.50 * U ldff1sb { z0.s }, p0/z, [x0, x0] -# CHECK-NEXT: 1 9 4.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 9 9.00 * U ldff1sb { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 7 7.00 * U ldff1sb { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: 1 7 7.00 * U ldff1sb { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -4730,8 +4730,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * U ldff1sh { z0.s }, p0/z, [x0, x0, lsl #1] -# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 9 9.00 * U ldff1sh { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -4739,8 +4739,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: 1 3 0.50 * U ldff1sh { z31.d }, p7/z, [sp] # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z31.d }, p7/z, [z31.d, #62] -# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: 1 3 0.50 * U ldff1sh { z31.s }, p7/z, [sp] # CHECK-NEXT: 1 9 9.00 * U ldff1sh { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 3 0.50 * U ldff1sw { z0.d }, p0/z, [x0, x0, lsl #2] @@ -4758,8 +4758,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] # CHECK-NEXT: 1 7 7.00 * U ldff1w { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * U ldff1w { z0.s }, p0/z, [x0, x0, lsl #2] -# CHECK-NEXT: 1 9 4.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 9 9.00 * U ldff1w { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 7 7.00 * U ldff1w { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: 1 7 7.00 * U ldff1w { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -4767,8 +4767,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1w { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: 1 3 0.50 * U ldff1w { z31.d }, p7/z, [sp] # CHECK-NEXT: 1 7 7.00 * U ldff1w { z31.d }, p7/z, [z31.d, #124] -# CHECK-NEXT: 1 9 4.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] -# CHECK-NEXT: 1 9 4.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: 1 7 3.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: 1 7 3.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] # CHECK-NEXT: 1 3 0.50 * U ldff1w { z31.s }, p7/z, [sp] # CHECK-NEXT: 1 9 9.00 * U ldff1w { z31.s }, p7/z, [z31.s, #124] # CHECK-NEXT: 1 3 0.50 * U ldnf1b { z0.b }, p0/z, [x0] @@ -4959,12 +4959,12 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 mls z0.h, z1.h, 
z7.h[7] # CHECK-NEXT: 1 4 0.50 mls z0.s, p7/m, z1.s, z31.s # CHECK-NEXT: 1 4 0.50 mls z0.s, z1.s, z7.s[3] -# CHECK-NEXT: 1 6 1.00 mov p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 mov p0.b, p0/m, p0.b -# CHECK-NEXT: 1 6 1.00 mov p0.b, p0/z, p0.b -# CHECK-NEXT: 1 6 1.00 mov p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 mov p15.b, p15/m, p15.b -# CHECK-NEXT: 1 6 1.00 mov p15.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 mov p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 mov p0.b, p0/m, p0.b +# CHECK-NEXT: 1 2 1.00 mov p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 mov p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 mov p15.b, p15/m, p15.b +# CHECK-NEXT: 1 2 1.00 mov p15.b, p15/z, p15.b # CHECK-NEXT: 1 3 0.50 mov z0.b, #127 # CHECK-NEXT: 1 3 0.50 mov z0.b, b0 # CHECK-NEXT: 1 3 0.50 mov z0.b, p0/m, b0 @@ -5062,10 +5062,10 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 mov z5.h, #-6 # CHECK-NEXT: 1 3 0.50 mov z5.q, z17.q[3] # CHECK-NEXT: 1 3 0.50 mov z5.s, #-6 -# CHECK-NEXT: 1 6 1.00 movs p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 movs p0.b, p0/z, p0.b -# CHECK-NEXT: 1 6 1.00 movs p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 movs p15.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 movs p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 movs p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 movs p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 movs p15.b, p15/z, p15.b # CHECK-NEXT: 1 1 1.00 U mrs x3, ID_AA64ZFR0_EL1 # CHECK-NEXT: 1 1 1.00 U mrs x3, ZCR_EL1 # CHECK-NEXT: 1 1 1.00 U mrs x3, ZCR_EL12 @@ -5098,10 +5098,10 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 mul z31.h, z31.h, #127 # CHECK-NEXT: 1 4 0.50 mul z31.s, z31.s, #-128 # CHECK-NEXT: 1 4 0.50 mul z31.s, z31.s, #127 -# CHECK-NEXT: 1 6 1.00 nand p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 nand p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 nands p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 nands p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 nand p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nand p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 nands p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nands p15.b, p15/z, p15.b, p15.b # CHECK-NEXT: 1 3 0.50 nbsl z0.d, z0.d, z1.d, z2.d # CHECK-NEXT: 1 3 0.50 neg z0.b, p0/m, z0.b # CHECK-NEXT: 1 3 0.50 neg z0.d, p0/m, z0.d @@ -5115,23 +5115,23 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 1.00 nmatch p0.h, p0/z, z0.h, z0.h # CHECK-NEXT: 1 7 1.00 nmatch p15.b, p7/z, z30.b, z31.b # CHECK-NEXT: 1 7 1.00 nmatch p15.h, p7/z, z30.h, z31.h -# CHECK-NEXT: 1 6 1.00 nor p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 nor p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 nors p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 nors p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 not p0.b, p0/z, p0.b -# CHECK-NEXT: 1 6 1.00 not p15.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 nor p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nor p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 nors p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nors p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 not p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 not p15.b, p15/z, p15.b # CHECK-NEXT: 1 3 0.50 not z31.b, p7/m, z31.b # CHECK-NEXT: 1 3 0.50 not z31.d, p7/m, z31.d # CHECK-NEXT: 1 3 0.50 not z31.h, p7/m, z31.h # CHECK-NEXT: 1 3 0.50 not z31.s, p7/m, z31.s -# CHECK-NEXT: 1 6 1.00 nots p0.b, p0/z, p0.b -# CHECK-NEXT: 1 6 1.00 nots p15.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 orn p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 orn p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 orns p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 orns p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 orr p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 
1 2 1.00 nots p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 nots p15.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 orn p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 orn p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 orns p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 orns p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 orr p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 3 0.50 orr z0.d, z0.d, #0x6 # CHECK-NEXT: 1 3 0.50 orr z0.d, z0.d, #0xfffffffffffffff9 # CHECK-NEXT: 1 3 0.50 orr z0.s, z0.s, #0x6 @@ -5145,27 +5145,27 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 orr z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 orr z5.b, z5.b, #0x6 # CHECK-NEXT: 1 3 0.50 orr z5.b, z5.b, #0xf9 -# CHECK-NEXT: 1 6 1.00 orrs p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 orrs p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 4 1.00 orv b0, p7, z31.b # CHECK-NEXT: 1 4 1.00 orv d0, p7, z31.d # CHECK-NEXT: 1 4 1.00 orv h0, p7, z31.h # CHECK-NEXT: 1 4 1.00 orv s0, p7, z31.s -# CHECK-NEXT: 1 6 1.00 pfalse p15.b -# CHECK-NEXT: 1 6 1.00 pfirst p0.b, p15, p0.b -# CHECK-NEXT: 1 6 1.00 pfirst p15.b, p15, p15.b +# CHECK-NEXT: 1 2 1.00 pfalse p15.b +# CHECK-NEXT: 1 2 1.00 pfirst p0.b, p15, p0.b +# CHECK-NEXT: 1 2 1.00 pfirst p15.b, p15, p15.b # CHECK-NEXT: 1 4 0.50 pmul z0.b, z1.b, z2.b # CHECK-NEXT: 1 4 0.50 pmul z29.b, z30.b, z31.b -# CHECK-NEXT: 1 6 1.00 pmullb z0.h, z1.b, z2.b -# CHECK-NEXT: 1 6 1.00 pmullb z29.q, z30.d, z31.d -# CHECK-NEXT: 1 6 1.00 pmullb z31.d, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 pmullt z0.h, z1.b, z2.b -# CHECK-NEXT: 1 6 1.00 pmullt z29.q, z30.d, z31.d -# CHECK-NEXT: 1 6 1.00 pmullt z31.d, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 pnext p0.b, p15, p0.b -# CHECK-NEXT: 1 6 1.00 pnext p0.d, p15, p0.d -# CHECK-NEXT: 1 6 1.00 pnext p0.h, p15, p0.h -# CHECK-NEXT: 1 6 1.00 pnext p0.s, p15, p0.s -# CHECK-NEXT: 1 6 1.00 pnext p15.b, p15, p15.b +# CHECK-NEXT: 1 9 1.00 pmullb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 9 1.00 pmullb z29.q, z30.d, z31.d +# CHECK-NEXT: 1 9 1.00 pmullb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 9 1.00 pmullt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 9 1.00 pmullt z29.q, z30.d, z31.d +# CHECK-NEXT: 1 9 1.00 pmullt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 pnext p0.b, p15, p0.b +# CHECK-NEXT: 1 2 1.00 pnext p0.d, p15, p0.d +# CHECK-NEXT: 1 2 1.00 pnext p0.h, p15, p0.h +# CHECK-NEXT: 1 2 1.00 pnext p0.s, p15, p0.s +# CHECK-NEXT: 1 2 1.00 pnext p15.b, p15, p15.b # CHECK-NEXT: 1 0 0.50 * * U prfb #14, p0, [x0] # CHECK-NEXT: 1 0 0.50 * * U prfb #15, p0, [x0] # CHECK-NEXT: 1 0 0.50 * * U prfb #6, p0, [x0] @@ -5274,97 +5274,97 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 0 0.50 * * U prfw pstl2strm, p0, [x0] # CHECK-NEXT: 1 0 0.50 * * U prfw pstl3keep, p0, [x0] # CHECK-NEXT: 1 0 0.50 * * U prfw pstl3strm, p0, [x0] -# CHECK-NEXT: 1 6 1.00 ptest p15, p0.b -# CHECK-NEXT: 1 6 1.00 ptest p15, p15.b -# CHECK-NEXT: 1 6 1.00 ptrue p0.b, pow2 -# CHECK-NEXT: 1 6 1.00 ptrue p0.d, pow2 -# CHECK-NEXT: 1 6 1.00 ptrue p0.h, pow2 -# CHECK-NEXT: 1 6 1.00 ptrue p0.s, pow2 -# CHECK-NEXT: 1 6 1.00 ptrue p15.b -# CHECK-NEXT: 1 6 1.00 ptrue p15.d -# CHECK-NEXT: 1 6 1.00 ptrue p15.h -# CHECK-NEXT: 1 6 1.00 ptrue p15.s -# CHECK-NEXT: 1 6 1.00 ptrue p7.s -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #14 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #15 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #16 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #17 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #18 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #19 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #20 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #21 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #22 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #23 -# 
CHECK-NEXT: 1 6 1.00 ptrue p7.s, #24 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #25 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #26 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #27 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #28 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, mul3 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, mul4 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl1 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl128 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl16 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl2 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl256 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl3 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl32 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl4 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl5 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl6 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl64 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl7 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl8 -# CHECK-NEXT: 1 6 1.00 ptrues p0.b, pow2 -# CHECK-NEXT: 1 6 1.00 ptrues p0.d, pow2 -# CHECK-NEXT: 1 6 1.00 ptrues p0.h, pow2 -# CHECK-NEXT: 1 6 1.00 ptrues p0.s, pow2 -# CHECK-NEXT: 1 6 1.00 ptrues p15.b -# CHECK-NEXT: 1 6 1.00 ptrues p15.d -# CHECK-NEXT: 1 6 1.00 ptrues p15.h -# CHECK-NEXT: 1 6 1.00 ptrues p15.s -# CHECK-NEXT: 1 6 1.00 ptrues p7.s -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #14 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #15 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #16 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #17 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #18 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #19 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #20 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #21 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #22 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #23 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #24 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #25 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #26 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #27 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #28 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, mul3 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, mul4 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl1 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl128 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl16 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl2 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl256 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl3 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl32 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl4 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl5 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl6 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl64 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl7 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl8 -# CHECK-NEXT: 1 6 1.00 punpkhi p0.h, p0.b -# CHECK-NEXT: 1 6 1.00 punpkhi p15.h, p15.b -# CHECK-NEXT: 1 6 1.00 punpklo p0.h, p0.b -# CHECK-NEXT: 1 6 1.00 punpklo p15.h, p15.b -# CHECK-NEXT: 1 4 0.50 raddhnb z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 raddhnb z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 raddhnb z0.s, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 raddhnt z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 raddhnt z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 raddhnt z0.s, z1.d, z31.d -# CHECK-NEXT: 1 8 1.00 rax1 z0.d, z1.d, z31.d +# CHECK-NEXT: 1 2 1.00 ptest p15, p0.b +# CHECK-NEXT: 1 2 1.00 ptest p15, p15.b +# CHECK-NEXT: 1 2 1.00 ptrue p0.b, pow2 +# CHECK-NEXT: 1 2 1.00 ptrue p0.d, pow2 +# CHECK-NEXT: 1 2 1.00 ptrue p0.h, pow2 +# CHECK-NEXT: 1 2 1.00 ptrue p0.s, pow2 +# CHECK-NEXT: 1 2 1.00 ptrue p15.b +# CHECK-NEXT: 1 2 1.00 ptrue p15.d +# CHECK-NEXT: 1 2 1.00 ptrue p15.h +# CHECK-NEXT: 1 2 1.00 ptrue p15.s +# CHECK-NEXT: 1 2 1.00 ptrue p7.s +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #14 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #15 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #16 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #17 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #18 +# 
CHECK-NEXT: 1 2 1.00 ptrue p7.s, #19 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #20 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #21 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #22 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #23 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #24 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #25 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #26 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #27 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #28 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, mul3 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, mul4 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl1 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl128 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl16 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl2 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl256 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl3 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl32 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl4 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl5 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl6 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl64 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl7 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl8 +# CHECK-NEXT: 1 2 1.00 ptrues p0.b, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p0.d, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p0.h, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p0.s, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p15.b +# CHECK-NEXT: 1 2 1.00 ptrues p15.d +# CHECK-NEXT: 1 2 1.00 ptrues p15.h +# CHECK-NEXT: 1 2 1.00 ptrues p15.s +# CHECK-NEXT: 1 2 1.00 ptrues p7.s +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #14 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #15 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #16 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #17 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #18 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #19 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #20 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #21 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #22 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #23 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #24 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #25 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #26 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #27 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #28 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, mul3 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, mul4 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl1 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl128 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl16 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl2 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl256 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl3 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl32 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl4 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl5 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl6 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl64 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl7 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl8 +# CHECK-NEXT: 1 2 1.00 punpkhi p0.h, p0.b +# CHECK-NEXT: 1 2 1.00 punpkhi p15.h, p15.b +# CHECK-NEXT: 1 2 1.00 punpklo p0.h, p0.b +# CHECK-NEXT: 1 2 1.00 punpklo p15.h, p15.b +# CHECK-NEXT: 1 8 0.50 raddhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 raddhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 raddhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 raddhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 raddhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 raddhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 9 1.00 rax1 z0.d, z1.d, z31.d # CHECK-NEXT: 1 3 0.50 rbit z0.b, p7/m, z31.b # CHECK-NEXT: 1 3 0.50 rbit z0.d, p7/m, z31.d # CHECK-NEXT: 1 3 0.50 rbit z0.h, p7/m, z31.h @@ -5379,16 +5379,16 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 1 0.33 rdvl x21, #-32 # CHECK-NEXT: 1 1 0.33 rdvl x23, #31 # CHECK-NEXT: 1 1 0.33 rdvl xzr, #-1 -# CHECK-NEXT: 1 4 0.50 rev z0.b, z31.b -# CHECK-NEXT: 1 4 0.50 rev z0.d, z31.d -# CHECK-NEXT: 1 4 0.50 rev z0.h, z31.h -# CHECK-NEXT: 
1 4 0.50 rev z0.s, z31.s -# CHECK-NEXT: 1 4 0.50 revb z0.d, p7/m, z31.d -# CHECK-NEXT: 1 4 0.50 revb z0.h, p7/m, z31.h -# CHECK-NEXT: 1 4 0.50 revb z0.s, p7/m, z31.s -# CHECK-NEXT: 1 4 0.50 revh z0.d, p7/m, z31.d -# CHECK-NEXT: 1 4 0.50 revh z0.s, p7/m, z31.s -# CHECK-NEXT: 1 4 0.50 revw z0.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 rev z0.b, z31.b +# CHECK-NEXT: 1 3 0.50 rev z0.d, z31.d +# CHECK-NEXT: 1 3 0.50 rev z0.h, z31.h +# CHECK-NEXT: 1 3 0.50 rev z0.s, z31.s +# CHECK-NEXT: 1 3 0.50 revb z0.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 revb z0.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 revb z0.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 revh z0.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 revh z0.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 revw z0.d, p7/m, z31.d # CHECK-NEXT: 1 4 0.50 rshrnb z0.b, z0.h, #1 # CHECK-NEXT: 1 4 0.50 rshrnb z0.h, z0.s, #1 # CHECK-NEXT: 1 4 0.50 rshrnb z0.s, z0.d, #1 @@ -5401,22 +5401,22 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 rshrnt z31.b, z31.h, #8 # CHECK-NEXT: 1 4 0.50 rshrnt z31.h, z31.s, #16 # CHECK-NEXT: 1 4 0.50 rshrnt z31.s, z31.d, #32 -# CHECK-NEXT: 1 4 0.50 rsubhnb z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 rsubhnb z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 rsubhnb z0.s, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 rsubhnt z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 rsubhnt z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 rsubhnt z0.s, z1.d, z31.d -# CHECK-NEXT: 1 8 1.00 saba z0.b, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 saba z0.d, z1.d, z31.d -# CHECK-NEXT: 1 8 1.00 saba z0.h, z1.h, z31.h -# CHECK-NEXT: 1 8 1.00 saba z0.s, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 sabalb z0.d, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 sabalb z0.h, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 sabalb z0.s, z1.h, z31.h -# CHECK-NEXT: 1 8 1.00 sabalt z0.d, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 sabalt z0.h, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 sabalt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 rsubhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 rsubhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 rsubhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 rsubhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 rsubhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 rsubhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 6 1.00 saba z0.b, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 saba z0.d, z1.d, z31.d +# CHECK-NEXT: 1 6 1.00 saba z0.h, z1.h, z31.h +# CHECK-NEXT: 1 6 1.00 saba z0.s, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 sabalb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 sabalb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 sabalb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 6 1.00 sabalt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 sabalt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 sabalt z0.s, z1.h, z31.h # CHECK-NEXT: 1 3 0.50 sabd z31.b, p7/m, z31.b, z31.b # CHECK-NEXT: 1 3 0.50 sabd z31.d, p7/m, z31.d, z31.d # CHECK-NEXT: 1 3 0.50 sabd z31.h, p7/m, z31.h, z31.h @@ -5430,24 +5430,24 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 1.00 sadalp z0.h, p0/m, z1.b # CHECK-NEXT: 1 7 1.00 sadalp z29.s, p0/m, z30.h # CHECK-NEXT: 1 7 1.00 sadalp z30.d, p7/m, z31.s -# CHECK-NEXT: 1 3 0.50 saddlb z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 saddlb z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 saddlb z31.d, z31.s, z31.s -# CHECK-NEXT: 1 3 0.50 saddlbt z0.d, z1.s, z31.s -# CHECK-NEXT: 1 3 0.50 saddlbt z0.h, z1.b, z31.b -# CHECK-NEXT: 1 3 0.50 saddlbt z0.s, z1.h, z31.h -# CHECK-NEXT: 1 3 0.50 saddlt z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 saddlt z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 saddlt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 saddlb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 saddlb z29.s, z30.h, z31.h +# 
CHECK-NEXT: 1 4 0.50 saddlb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 saddlbt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 saddlbt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 saddlbt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 saddlt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 saddlt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 saddlt z31.d, z31.s, z31.s # CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.b # CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.h # CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.s -# CHECK-NEXT: 1 3 0.50 saddwb z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 saddwb z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 saddwb z31.d, z31.d, z31.s -# CHECK-NEXT: 1 3 0.50 saddwt z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 saddwt z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 saddwt z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 saddwb z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 saddwb z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 saddwb z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 saddwt z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 saddwt z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 saddwt z31.d, z31.d, z31.s # CHECK-NEXT: 1 4 0.50 sbclb z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 0.50 sbclb z0.s, z1.s, z31.s # CHECK-NEXT: 1 4 0.50 sbclt z0.d, z1.d, z31.d @@ -5504,8 +5504,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 sli z31.d, z31.d, #63 # CHECK-NEXT: 1 3 0.50 sli z31.h, z31.h, #15 # CHECK-NEXT: 1 3 0.50 sli z31.s, z31.s, #31 -# CHECK-NEXT: 1 8 1.00 sm4e z0.s, z0.s, z31.s -# CHECK-NEXT: 1 8 1.00 sm4ekey z0.s, z1.s, z31.s +# CHECK-NEXT: 1 9 1.00 sm4e z0.s, z0.s, z31.s +# CHECK-NEXT: 1 9 1.00 sm4ekey z0.s, z1.s, z31.s # CHECK-NEXT: 1 3 0.50 smax z0.b, z0.b, #-128 # CHECK-NEXT: 1 3 0.50 smax z0.d, z0.d, #-128 # CHECK-NEXT: 1 3 0.50 smax z0.h, z0.h, #-128 @@ -5624,61 +5624,61 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 sqcadd z31.d, z31.d, z31.d, #270 # CHECK-NEXT: 1 4 0.50 sqcadd z31.h, z31.h, z31.h, #270 # CHECK-NEXT: 1 4 0.50 sqcadd z31.s, z31.s, z31.s, #270 -# CHECK-NEXT: 1 1 0.33 sqdecb x0 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, #14 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecd x0 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, #14 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecb x0 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, #14 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecd x0 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, #14 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqdecd z0.d # CHECK-NEXT: 1 4 0.50 
sqdecd z0.d, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqdecd z0.d, pow2 # CHECK-NEXT: 1 4 0.50 sqdecd z0.d, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdech x0 -# CHECK-NEXT: 1 1 0.33 sqdech x0, #14 -# CHECK-NEXT: 1 1 0.33 sqdech x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdech x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdech x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqdech x0, w0 -# CHECK-NEXT: 1 1 0.33 sqdech x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdech x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdech x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdech x0 +# CHECK-NEXT: 1 4 0.33 sqdech x0, #14 +# CHECK-NEXT: 1 4 0.33 sqdech x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdech x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdech x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqdech x0, w0 +# CHECK-NEXT: 1 4 0.33 sqdech x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdech x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdech x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqdech z0.h # CHECK-NEXT: 1 4 0.50 sqdech z0.h, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqdech z0.h, pow2 # CHECK-NEXT: 1 4 0.50 sqdech z0.h, pow2, mul #16 -# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.b -# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.d -# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.h -# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.s -# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.b, wzr -# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.d, wzr -# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.h, wzr -# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.s, wzr +# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.b +# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.d +# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.h +# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.s +# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.b, wzr +# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.d, wzr +# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.h, wzr +# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.s, wzr # CHECK-NEXT: 1 4 0.50 sqdecp z0.d, p0.d # CHECK-NEXT: 1 4 0.50 sqdecp z0.h, p0.h # CHECK-NEXT: 1 4 0.50 sqdecp z0.s, p0.s -# CHECK-NEXT: 1 1 0.33 sqdecw x0 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, #14 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecw x0 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, #14 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqdecw z0.s # CHECK-NEXT: 1 4 0.50 sqdecw z0.s, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqdecw z0.s, pow2 @@ -5726,61 +5726,61 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 sqdmullt z0.s, z1.h, z7.h[7] # CHECK-NEXT: 1 4 0.50 sqdmullt z29.s, z30.h, z31.h # CHECK-NEXT: 1 4 0.50 sqdmullt z31.d, z31.s, z31.s -# CHECK-NEXT: 1 1 0.33 sqincb x0 -# CHECK-NEXT: 1 1 0.33 sqincb x0, #14 -# CHECK-NEXT: 1 1 0.33 sqincb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincb x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqincb x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqincb x0, w0 -# CHECK-NEXT: 1 1 0.33 sqincb x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincb x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqincb x0, w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincd x0 -# CHECK-NEXT: 1 1 0.33 sqincd x0, #14 -# CHECK-NEXT: 1 1 0.33 sqincd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincd x0, pow2 -# CHECK-NEXT: 1 
1 0.33 sqincd x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqincd x0, w0 -# CHECK-NEXT: 1 1 0.33 sqincd x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincd x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqincd x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincb x0 +# CHECK-NEXT: 1 4 0.33 sqincb x0, #14 +# CHECK-NEXT: 1 4 0.33 sqincb x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincb x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincb x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqincb x0, w0 +# CHECK-NEXT: 1 4 0.33 sqincb x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincb x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincb x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincd x0 +# CHECK-NEXT: 1 4 0.33 sqincd x0, #14 +# CHECK-NEXT: 1 4 0.33 sqincd x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincd x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincd x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqincd x0, w0 +# CHECK-NEXT: 1 4 0.33 sqincd x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincd x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincd x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqincd z0.d # CHECK-NEXT: 1 4 0.50 sqincd z0.d, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqincd z0.d, pow2 # CHECK-NEXT: 1 4 0.50 sqincd z0.d, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 sqinch x0 -# CHECK-NEXT: 1 1 0.33 sqinch x0, #14 -# CHECK-NEXT: 1 1 0.33 sqinch x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqinch x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqinch x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqinch x0, w0 -# CHECK-NEXT: 1 1 0.33 sqinch x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqinch x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqinch x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqinch x0 +# CHECK-NEXT: 1 4 0.33 sqinch x0, #14 +# CHECK-NEXT: 1 4 0.33 sqinch x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqinch x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqinch x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqinch x0, w0 +# CHECK-NEXT: 1 4 0.33 sqinch x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqinch x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqinch x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqinch z0.h # CHECK-NEXT: 1 4 0.50 sqinch z0.h, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqinch z0.h, pow2 # CHECK-NEXT: 1 4 0.50 sqinch z0.h, pow2, mul #16 -# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.b -# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.d -# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.h -# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.s -# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.b, wzr -# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.d, wzr -# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.h, wzr -# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.s, wzr +# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.b +# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.d +# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.h +# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.s +# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.b, wzr +# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.d, wzr +# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.h, wzr +# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.s, wzr # CHECK-NEXT: 1 4 0.50 sqincp z0.d, p0.d # CHECK-NEXT: 1 4 0.50 sqincp z0.h, p0.h # CHECK-NEXT: 1 4 0.50 sqincp z0.s, p0.s -# CHECK-NEXT: 1 1 0.33 sqincw x0 -# CHECK-NEXT: 1 1 0.33 sqincw x0, #14 -# CHECK-NEXT: 1 1 0.33 sqincw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincw x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqincw x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqincw x0, w0 -# CHECK-NEXT: 1 1 0.33 sqincw x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincw x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqincw x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincw x0 +# CHECK-NEXT: 1 4 0.33 sqincw x0, #14 +# CHECK-NEXT: 1 4 0.33 sqincw x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincw x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincw x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqincw x0, w0 +# 
CHECK-NEXT: 1 4 0.33 sqincw x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincw x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincw x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqincw z0.s # CHECK-NEXT: 1 4 0.50 sqincw z0.s, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqincw z0.s, pow2 @@ -6001,24 +6001,24 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 ssra z31.d, z31.d, #64 # CHECK-NEXT: 1 4 0.50 ssra z31.h, z31.h, #16 # CHECK-NEXT: 1 4 0.50 ssra z31.s, z31.s, #32 -# CHECK-NEXT: 1 3 0.50 ssublb z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 ssublb z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 ssublb z31.d, z31.s, z31.s -# CHECK-NEXT: 1 3 0.50 ssublbt z0.d, z1.s, z31.s -# CHECK-NEXT: 1 3 0.50 ssublbt z0.h, z1.b, z31.b -# CHECK-NEXT: 1 3 0.50 ssublbt z0.s, z1.h, z31.h -# CHECK-NEXT: 1 3 0.50 ssublt z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 ssublt z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 ssublt z31.d, z31.s, z31.s -# CHECK-NEXT: 1 3 0.50 ssubltb z0.d, z1.s, z31.s -# CHECK-NEXT: 1 3 0.50 ssubltb z0.h, z1.b, z31.b -# CHECK-NEXT: 1 3 0.50 ssubltb z0.s, z1.h, z31.h -# CHECK-NEXT: 1 3 0.50 ssubwb z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 ssubwb z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 ssubwb z31.d, z31.d, z31.s -# CHECK-NEXT: 1 3 0.50 ssubwt z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 ssubwt z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 ssubwt z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 ssublb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 ssublb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 ssublb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssublbt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssublbt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 ssublbt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 ssublt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 ssublt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 ssublt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssubltb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssubltb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 ssubltb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 ssubwb z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 ssubwb z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 ssubwb z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 ssubwt z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 ssubwt z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 ssubwt z31.d, z31.d, z31.s # CHECK-NEXT: 1 1 1.00 * st1b { z0.b }, p0, [x0, x0] # CHECK-NEXT: 1 1 1.00 * st1b { z0.b }, p0, [x0] # CHECK-NEXT: 1 1 1.00 * st1b { z0.d }, p0, [x0, x0] @@ -6250,12 +6250,12 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 sub z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 sub z31.s, z31.s, #65280 # CHECK-NEXT: 1 3 0.50 sub z31.s, z31.s, z31.s -# CHECK-NEXT: 1 4 0.50 subhnb z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 subhnb z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 subhnb z0.s, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 subhnt z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 subhnt z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 subhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 subhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 subhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 subhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 subhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 subhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 subhnt z0.s, z1.d, z31.d # CHECK-NEXT: 1 3 0.50 subr z0.b, p0/m, z0.b, z0.b # CHECK-NEXT: 1 3 0.50 subr z0.b, z0.b, #0 # CHECK-NEXT: 1 3 0.50 subr z0.d, p0/m, z0.d, z0.d @@ -6305,32 +6305,32 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 tbx z31.d, z31.d, z31.d # CHECK-NEXT: 1 4 0.50 tbx z31.h, z31.h, z31.h # CHECK-NEXT: 1 4 0.50 tbx z31.s, z31.s, z31.s -# CHECK-NEXT: 1 6 
1.00 trn1 p15.b, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 trn1 p15.d, p15.d, p15.d -# CHECK-NEXT: 1 6 1.00 trn1 p15.h, p15.h, p15.h -# CHECK-NEXT: 1 6 1.00 trn1 p15.s, p15.s, p15.s -# CHECK-NEXT: 1 4 0.50 trn1 z31.b, z31.b, z31.b -# CHECK-NEXT: 1 4 0.50 trn1 z31.d, z31.d, z31.d -# CHECK-NEXT: 1 4 0.50 trn1 z31.h, z31.h, z31.h -# CHECK-NEXT: 1 4 0.50 trn1 z31.s, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 trn2 p15.b, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 trn2 p15.d, p15.d, p15.d -# CHECK-NEXT: 1 6 1.00 trn2 p15.h, p15.h, p15.h -# CHECK-NEXT: 1 6 1.00 trn2 p15.s, p15.s, p15.s -# CHECK-NEXT: 1 4 0.50 trn2 z31.b, z31.b, z31.b -# CHECK-NEXT: 1 4 0.50 trn2 z31.d, z31.d, z31.d -# CHECK-NEXT: 1 4 0.50 trn2 z31.h, z31.h, z31.h -# CHECK-NEXT: 1 4 0.50 trn2 z31.s, z31.s, z31.s -# CHECK-NEXT: 1 8 1.00 uaba z0.b, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 uaba z0.d, z1.d, z31.d -# CHECK-NEXT: 1 8 1.00 uaba z0.h, z1.h, z31.h -# CHECK-NEXT: 1 8 1.00 uaba z0.s, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 uabalb z0.d, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 uabalb z0.h, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 uabalb z0.s, z1.h, z31.h -# CHECK-NEXT: 1 8 1.00 uabalt z0.d, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 uabalt z0.h, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 uabalt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 2 1.00 trn1 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 trn1 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 2 1.00 trn1 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 2 1.00 trn1 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 trn1 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 trn1 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 trn1 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 trn1 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 trn2 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 trn2 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 2 1.00 trn2 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 2 1.00 trn2 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 trn2 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 trn2 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 trn2 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 trn2 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 6 1.00 uaba z0.b, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 uaba z0.d, z1.d, z31.d +# CHECK-NEXT: 1 6 1.00 uaba z0.h, z1.h, z31.h +# CHECK-NEXT: 1 6 1.00 uaba z0.s, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 uabalb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 uabalb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 uabalb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 6 1.00 uabalt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 uabalt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 uabalt z0.s, z1.h, z31.h # CHECK-NEXT: 1 3 0.50 uabd z31.b, p7/m, z31.b, z31.b # CHECK-NEXT: 1 3 0.50 uabd z31.d, p7/m, z31.d, z31.d # CHECK-NEXT: 1 3 0.50 uabd z31.h, p7/m, z31.h, z31.h @@ -6344,22 +6344,22 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 1.00 uadalp z0.h, p0/m, z1.b # CHECK-NEXT: 1 7 1.00 uadalp z29.s, p0/m, z30.h # CHECK-NEXT: 1 7 1.00 uadalp z30.d, p7/m, z31.s -# CHECK-NEXT: 1 3 0.50 uaddlb z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 uaddlb z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 uaddlb z31.d, z31.s, z31.s -# CHECK-NEXT: 1 3 0.50 uaddlt z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 uaddlt z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 uaddlt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 uaddlb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 uaddlb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 uaddlb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 uaddlt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 uaddlt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 uaddlt z31.d, z31.s, z31.s # CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.b # CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.d 
# CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.h # CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.s -# CHECK-NEXT: 1 3 0.50 uaddwb z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 uaddwb z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 uaddwb z31.d, z31.d, z31.s -# CHECK-NEXT: 1 3 0.50 uaddwt z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 uaddwt z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 uaddwt z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 uaddwb z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 uaddwb z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 uaddwb z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 uaddwt z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 uaddwt z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 uaddwt z31.d, z31.d, z31.s # CHECK-NEXT: 1 4 0.50 ucvtf z0.d, p0/m, z0.d # CHECK-NEXT: 1 4 0.50 ucvtf z0.d, p0/m, z0.s # CHECK-NEXT: 1 4 0.50 ucvtf z0.h, p0/m, z0.d @@ -6473,120 +6473,120 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 uqadd z31.d, z31.d, #65280 # CHECK-NEXT: 1 4 0.50 uqadd z31.h, z31.h, #65280 # CHECK-NEXT: 1 4 0.50 uqadd z31.s, z31.s, #65280 -# CHECK-NEXT: 1 1 0.33 uqdecb w0 -# CHECK-NEXT: 1 1 0.33 uqdecb w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecb w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecb w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecb x0 -# CHECK-NEXT: 1 1 0.33 uqdecb x0, #14 -# CHECK-NEXT: 1 1 0.33 uqdecb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecb x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecb x0, vl1 -# CHECK-NEXT: 1 1 0.33 uqdecd w0 -# CHECK-NEXT: 1 1 0.33 uqdecd w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecd w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecd w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecd x0 -# CHECK-NEXT: 1 1 0.33 uqdecd x0, #14 -# CHECK-NEXT: 1 1 0.33 uqdecd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecd x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecd x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqdecb w0 +# CHECK-NEXT: 1 4 0.33 uqdecb w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecb w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecb w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecb x0 +# CHECK-NEXT: 1 4 0.33 uqdecb x0, #14 +# CHECK-NEXT: 1 4 0.33 uqdecb x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecb x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecb x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqdecd w0 +# CHECK-NEXT: 1 4 0.33 uqdecd w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecd w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecd w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecd x0 +# CHECK-NEXT: 1 4 0.33 uqdecd x0, #14 +# CHECK-NEXT: 1 4 0.33 uqdecd x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecd x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecd x0, vl1 # CHECK-NEXT: 1 4 0.50 uqdecd z0.d # CHECK-NEXT: 1 4 0.50 uqdecd z0.d, all, mul #16 # CHECK-NEXT: 1 4 0.50 uqdecd z0.d, pow2 # CHECK-NEXT: 1 4 0.50 uqdecd z0.d, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdech w0 -# CHECK-NEXT: 1 1 0.33 uqdech w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdech w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdech w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdech x0 -# CHECK-NEXT: 1 1 0.33 uqdech x0, #14 -# CHECK-NEXT: 1 1 0.33 uqdech x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdech x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdech x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqdech w0 +# CHECK-NEXT: 1 4 0.33 uqdech w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdech w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdech w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdech x0 +# CHECK-NEXT: 1 4 0.33 uqdech x0, #14 +# CHECK-NEXT: 1 4 0.33 uqdech x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdech x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdech x0, vl1 # CHECK-NEXT: 1 4 0.50 uqdech z0.h # CHECK-NEXT: 1 4 0.50 uqdech z0.h, all, mul #16 # CHECK-NEXT: 1 4 0.50 uqdech z0.h, pow2 # 
CHECK-NEXT: 1 4 0.50 uqdech z0.h, pow2, mul #16 -# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.b -# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.d -# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.h -# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.s -# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.b -# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.d -# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.h -# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.s +# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.b +# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.d +# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.h +# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.s +# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.b +# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.d +# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.h +# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.s # CHECK-NEXT: 1 4 0.50 uqdecp z0.d, p0.d # CHECK-NEXT: 1 4 0.50 uqdecp z0.h, p0.h # CHECK-NEXT: 1 4 0.50 uqdecp z0.s, p0.s -# CHECK-NEXT: 1 1 0.33 uqdecw w0 -# CHECK-NEXT: 1 1 0.33 uqdecw w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecw w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecw w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecw x0 -# CHECK-NEXT: 1 1 0.33 uqdecw x0, #14 -# CHECK-NEXT: 1 1 0.33 uqdecw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecw x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecw x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqdecw w0 +# CHECK-NEXT: 1 4 0.33 uqdecw w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecw w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecw w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecw x0 +# CHECK-NEXT: 1 4 0.33 uqdecw x0, #14 +# CHECK-NEXT: 1 4 0.33 uqdecw x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecw x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecw x0, vl1 # CHECK-NEXT: 1 4 0.50 uqdecw z0.s # CHECK-NEXT: 1 4 0.50 uqdecw z0.s, all, mul #16 # CHECK-NEXT: 1 4 0.50 uqdecw z0.s, pow2 # CHECK-NEXT: 1 4 0.50 uqdecw z0.s, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincb w0 -# CHECK-NEXT: 1 1 0.33 uqincb w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincb w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqincb w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincb x0 -# CHECK-NEXT: 1 1 0.33 uqincb x0, #14 -# CHECK-NEXT: 1 1 0.33 uqincb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincb x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqincb x0, vl1 -# CHECK-NEXT: 1 1 0.33 uqincd w0 -# CHECK-NEXT: 1 1 0.33 uqincd w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincd w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqincd w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincd x0 -# CHECK-NEXT: 1 1 0.33 uqincd x0, #14 -# CHECK-NEXT: 1 1 0.33 uqincd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincd x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqincd x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqincb w0 +# CHECK-NEXT: 1 4 0.33 uqincb w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincb w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqincb w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincb x0 +# CHECK-NEXT: 1 4 0.33 uqincb x0, #14 +# CHECK-NEXT: 1 4 0.33 uqincb x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincb x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqincb x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqincd w0 +# CHECK-NEXT: 1 4 0.33 uqincd w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincd w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqincd w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincd x0 +# CHECK-NEXT: 1 4 0.33 uqincd x0, #14 +# CHECK-NEXT: 1 4 0.33 uqincd x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincd x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqincd x0, vl1 # CHECK-NEXT: 1 4 0.50 uqincd z0.d # CHECK-NEXT: 1 4 0.50 uqincd z0.d, all, mul #16 # CHECK-NEXT: 1 4 0.50 uqincd z0.d, pow2 # CHECK-NEXT: 1 4 0.50 uqincd z0.d, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqinch w0 -# CHECK-NEXT: 1 1 0.33 uqinch w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqinch w0, pow2 -# CHECK-NEXT: 1 1 0.33 
uqinch w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqinch x0
-# CHECK-NEXT: 1 1 0.33 uqinch x0, #14
-# CHECK-NEXT: 1 1 0.33 uqinch x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqinch x0, pow2
-# CHECK-NEXT: 1 1 0.33 uqinch x0, vl1
+# CHECK-NEXT: 1 4 0.33 uqinch w0
+# CHECK-NEXT: 1 4 0.33 uqinch w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqinch w0, pow2
+# CHECK-NEXT: 1 4 0.33 uqinch w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 uqinch x0
+# CHECK-NEXT: 1 4 0.33 uqinch x0, #14
+# CHECK-NEXT: 1 4 0.33 uqinch x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqinch x0, pow2
+# CHECK-NEXT: 1 4 0.33 uqinch x0, vl1
# CHECK-NEXT: 1 4 0.50 uqinch z0.h
# CHECK-NEXT: 1 4 0.50 uqinch z0.h, all, mul #16
# CHECK-NEXT: 1 4 0.50 uqinch z0.h, pow2
# CHECK-NEXT: 1 4 0.50 uqinch z0.h, pow2, mul #16
-# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.b
-# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.d
-# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.h
-# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.s
-# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.b
-# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.d
-# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.h
-# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.s
+# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.b
+# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.d
+# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.h
+# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.s
+# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.b
+# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.d
+# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.h
+# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.s
# CHECK-NEXT: 1 4 0.50 uqincp z0.d, p0.d
# CHECK-NEXT: 1 4 0.50 uqincp z0.h, p0.h
# CHECK-NEXT: 1 4 0.50 uqincp z0.s, p0.s
-# CHECK-NEXT: 1 1 0.33 uqincw w0
-# CHECK-NEXT: 1 1 0.33 uqincw w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincw w0, pow2
-# CHECK-NEXT: 1 1 0.33 uqincw w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincw x0
-# CHECK-NEXT: 1 1 0.33 uqincw x0, #14
-# CHECK-NEXT: 1 1 0.33 uqincw x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincw x0, pow2
-# CHECK-NEXT: 1 1 0.33 uqincw x0, vl1
+# CHECK-NEXT: 1 4 0.33 uqincw w0
+# CHECK-NEXT: 1 4 0.33 uqincw w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincw w0, pow2
+# CHECK-NEXT: 1 4 0.33 uqincw w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincw x0
+# CHECK-NEXT: 1 4 0.33 uqincw x0, #14
+# CHECK-NEXT: 1 4 0.33 uqincw x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincw x0, pow2
+# CHECK-NEXT: 1 4 0.33 uqincw x0, vl1
# CHECK-NEXT: 1 4 0.50 uqincw z0.s
# CHECK-NEXT: 1 4 0.50 uqincw z0.s, all, mul #16
# CHECK-NEXT: 1 4 0.50 uqincw z0.s, pow2
@@ -6723,18 +6723,18 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 4 0.50 usra z31.d, z31.d, #64
# CHECK-NEXT: 1 4 0.50 usra z31.h, z31.h, #16
# CHECK-NEXT: 1 4 0.50 usra z31.s, z31.s, #32
-# CHECK-NEXT: 1 3 0.50 usublb z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 3 0.50 usublb z29.s, z30.h, z31.h
-# CHECK-NEXT: 1 3 0.50 usublb z31.d, z31.s, z31.s
-# CHECK-NEXT: 1 3 0.50 usublt z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 3 0.50 usublt z29.s, z30.h, z31.h
-# CHECK-NEXT: 1 3 0.50 usublt z31.d, z31.s, z31.s
-# CHECK-NEXT: 1 3 0.50 usubwb z0.h, z1.h, z2.b
-# CHECK-NEXT: 1 3 0.50 usubwb z29.s, z30.s, z31.h
-# CHECK-NEXT: 1 3 0.50 usubwb z31.d, z31.d, z31.s
-# CHECK-NEXT: 1 3 0.50 usubwt z0.h, z1.h, z2.b
-# CHECK-NEXT: 1 3 0.50 usubwt z29.s, z30.s, z31.h
-# CHECK-NEXT: 1 3 0.50 usubwt z31.d, z31.d, z31.s
+# CHECK-NEXT: 1 4 0.50 usublb z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 4 0.50 usublb z29.s, z30.h, z31.h
+# CHECK-NEXT: 1 4 0.50 usublb z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 4 0.50 usublt z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 4 0.50 usublt z29.s, z30.h, z31.h
+# CHECK-NEXT: 1 4 0.50 usublt z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 4 0.50 usubwb z0.h, z1.h, z2.b
+# CHECK-NEXT: 1 4 0.50 usubwb z29.s, z30.s, z31.h
+# CHECK-NEXT: 1 4 0.50 usubwb z31.d, z31.d, z31.s
+# CHECK-NEXT: 1 4 0.50 usubwt z0.h, z1.h, z2.b
+# CHECK-NEXT: 1 4 0.50 usubwt z29.s, z30.s, z31.h
+# CHECK-NEXT: 1 4 0.50 usubwt z31.d, z31.d, z31.s
# CHECK-NEXT: 1 4 0.50 uunpkhi z31.d, z31.s
# CHECK-NEXT: 1 4 0.50 uunpkhi z31.h, z31.b
# CHECK-NEXT: 1 4 0.50 uunpkhi z31.s, z31.h
@@ -6753,82 +6753,82 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 uxth z31.s, p7/m, z31.s
# CHECK-NEXT: 1 3 0.50 uxtw z0.d, p0/m, z0.d
# CHECK-NEXT: 1 3 0.50 uxtw z31.d, p7/m, z31.d
-# CHECK-NEXT: 1 6 1.00 uzp1 p15.b, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 uzp1 p15.d, p15.d, p15.d
-# CHECK-NEXT: 1 6 1.00 uzp1 p15.h, p15.h, p15.h
-# CHECK-NEXT: 1 6 1.00 uzp1 p15.s, p15.s, p15.s
-# CHECK-NEXT: 1 4 0.50 uzp1 z31.b, z31.b, z31.b
-# CHECK-NEXT: 1 4 0.50 uzp1 z31.d, z31.d, z31.d
-# CHECK-NEXT: 1 4 0.50 uzp1 z31.h, z31.h, z31.h
-# CHECK-NEXT: 1 4 0.50 uzp1 z31.s, z31.s, z31.s
-# CHECK-NEXT: 1 6 1.00 uzp2 p15.b, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 uzp2 p15.d, p15.d, p15.d
-# CHECK-NEXT: 1 6 1.00 uzp2 p15.h, p15.h, p15.h
-# CHECK-NEXT: 1 6 1.00 uzp2 p15.s, p15.s, p15.s
-# CHECK-NEXT: 1 4 0.50 uzp2 z31.b, z31.b, z31.b
-# CHECK-NEXT: 1 4 0.50 uzp2 z31.d, z31.d, z31.d
-# CHECK-NEXT: 1 4 0.50 uzp2 z31.h, z31.h, z31.h
-# CHECK-NEXT: 1 4 0.50 uzp2 z31.s, z31.s, z31.s
-# CHECK-NEXT: 1 6 1.00 whilege p15.b, w0, wzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.b, wzr, w0
-# CHECK-NEXT: 1 6 1.00 whilege p15.b, x0, xzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.b, xzr, x0
-# CHECK-NEXT: 1 6 1.00 whilege p15.d, w0, wzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.d, x0, xzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.h, w0, wzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.h, x0, xzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.s, w0, wzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.s, x0, xzr
-# CHECK-NEXT: 1 6 1.00 whilerw p15.b, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilerw p15.d, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilerw p15.h, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilerw p15.s, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilewr p15.b, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilewr p15.d, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilewr p15.h, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilewr p15.s, x30, x30
+# CHECK-NEXT: 1 2 1.00 uzp1 p15.b, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 uzp1 p15.d, p15.d, p15.d
+# CHECK-NEXT: 1 2 1.00 uzp1 p15.h, p15.h, p15.h
+# CHECK-NEXT: 1 2 1.00 uzp1 p15.s, p15.s, p15.s
+# CHECK-NEXT: 1 3 0.50 uzp1 z31.b, z31.b, z31.b
+# CHECK-NEXT: 1 3 0.50 uzp1 z31.d, z31.d, z31.d
+# CHECK-NEXT: 1 3 0.50 uzp1 z31.h, z31.h, z31.h
+# CHECK-NEXT: 1 3 0.50 uzp1 z31.s, z31.s, z31.s
+# CHECK-NEXT: 1 2 1.00 uzp2 p15.b, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 uzp2 p15.d, p15.d, p15.d
+# CHECK-NEXT: 1 2 1.00 uzp2 p15.h, p15.h, p15.h
+# CHECK-NEXT: 1 2 1.00 uzp2 p15.s, p15.s, p15.s
+# CHECK-NEXT: 1 3 0.50 uzp2 z31.b, z31.b, z31.b
+# CHECK-NEXT: 1 3 0.50 uzp2 z31.d, z31.d, z31.d
+# CHECK-NEXT: 1 3 0.50 uzp2 z31.h, z31.h, z31.h
+# CHECK-NEXT: 1 3 0.50 uzp2 z31.s, z31.s, z31.s
+# CHECK-NEXT: 1 2 1.00 whilege p15.b, w0, wzr
+# CHECK-NEXT: 1 2 1.00 whilege p15.b, wzr, w0
+# CHECK-NEXT: 1 2 1.00 whilege p15.b, x0, xzr
+# CHECK-NEXT: 1 2 1.00 whilege p15.b, xzr, x0
+# CHECK-NEXT: 1 2 1.00 whilege p15.d, w0, wzr
+# CHECK-NEXT: 1 2 1.00 whilege p15.d, x0, xzr
+# CHECK-NEXT: 1 2 1.00 whilege p15.h, w0, wzr
+# CHECK-NEXT: 1 2 1.00 whilege p15.h, x0, xzr
+# CHECK-NEXT: 1 2 1.00 whilege p15.s, w0, wzr
+# CHECK-NEXT: 1 2 1.00 whilege p15.s, x0, xzr
+# CHECK-NEXT: 1 2 1.00 whilerw p15.b, x30, x30
+# CHECK-NEXT: 1 2 1.00 whilerw p15.d, x30, x30
+# CHECK-NEXT: 1 2 1.00 whilerw p15.h, x30, x30
+# CHECK-NEXT: 1 2 1.00 whilerw p15.s, x30, x30
+# CHECK-NEXT: 1 2 1.00 whilewr p15.b, x30, x30
+# CHECK-NEXT: 1 2 1.00 whilewr p15.d, x30, x30
+# CHECK-NEXT: 1 2 1.00 whilewr p15.h, x30, x30
+# CHECK-NEXT: 1 2 1.00 whilewr p15.s, x30, x30
# CHECK-NEXT: 1 1 0.33 * U wrffr p0.b
# CHECK-NEXT: 1 1 0.33 * U wrffr p15.b
-# CHECK-NEXT: 1 3 0.50 xar z0.b, z0.b, z1.b, #1
-# CHECK-NEXT: 1 3 0.50 xar z0.d, z0.d, z1.d, #1
-# CHECK-NEXT: 1 3 0.50 xar z0.h, z0.h, z1.h, #1
-# CHECK-NEXT: 1 3 0.50 xar z0.s, z0.s, z1.s, #1
-# CHECK-NEXT: 1 3 0.50 xar z31.b, z31.b, z30.b, #8
-# CHECK-NEXT: 1 3 0.50 xar z31.d, z31.d, z30.d, #64
-# CHECK-NEXT: 1 3 0.50 xar z31.h, z31.h, z30.h, #16
-# CHECK-NEXT: 1 3 0.50 xar z31.s, z31.s, z30.s, #32
-# CHECK-NEXT: 1 6 1.00 zip1 p0.b, p0.b, p0.b
-# CHECK-NEXT: 1 6 1.00 zip1 p0.d, p0.d, p0.d
-# CHECK-NEXT: 1 6 1.00 zip1 p0.h, p0.h, p0.h
-# CHECK-NEXT: 1 6 1.00 zip1 p0.s, p0.s, p0.s
-# CHECK-NEXT: 1 6 1.00 zip1 p15.b, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 zip1 p15.d, p15.d, p15.d
-# CHECK-NEXT: 1 6 1.00 zip1 p15.h, p15.h, p15.h
-# CHECK-NEXT: 1 6 1.00 zip1 p15.s, p15.s, p15.s
-# CHECK-NEXT: 1 4 0.50 zip1 z0.b, z0.b, z0.b
-# CHECK-NEXT: 1 4 0.50 zip1 z0.d, z0.d, z0.d
-# CHECK-NEXT: 1 4 0.50 zip1 z0.h, z0.h, z0.h
-# CHECK-NEXT: 1 4 0.50 zip1 z0.s, z0.s, z0.s
-# CHECK-NEXT: 1 4 0.50 zip1 z31.b, z31.b, z31.b
-# CHECK-NEXT: 1 4 0.50 zip1 z31.d, z31.d, z31.d
-# CHECK-NEXT: 1 4 0.50 zip1 z31.h, z31.h, z31.h
-# CHECK-NEXT: 1 4 0.50 zip1 z31.s, z31.s, z31.s
-# CHECK-NEXT: 1 6 1.00 zip2 p0.b, p0.b, p0.b
-# CHECK-NEXT: 1 6 1.00 zip2 p0.d, p0.d, p0.d
-# CHECK-NEXT: 1 6 1.00 zip2 p0.h, p0.h, p0.h
-# CHECK-NEXT: 1 6 1.00 zip2 p0.s, p0.s, p0.s
-# CHECK-NEXT: 1 6 1.00 zip2 p15.b, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 zip2 p15.d, p15.d, p15.d
-# CHECK-NEXT: 1 6 1.00 zip2 p15.h, p15.h, p15.h
-# CHECK-NEXT: 1 6 1.00 zip2 p15.s, p15.s, p15.s
-# CHECK-NEXT: 1 4 0.50 zip2 z0.b, z0.b, z0.b
-# CHECK-NEXT: 1 4 0.50 zip2 z0.d, z0.d, z0.d
-# CHECK-NEXT: 1 4 0.50 zip2 z0.h, z0.h, z0.h
-# CHECK-NEXT: 1 4 0.50 zip2 z0.s, z0.s, z0.s
-# CHECK-NEXT: 1 4 0.50 zip2 z31.b, z31.b, z31.b
-# CHECK-NEXT: 1 4 0.50 zip2 z31.d, z31.d, z31.d
-# CHECK-NEXT: 1 4 0.50 zip2 z31.h, z31.h, z31.h
-# CHECK-NEXT: 1 4 0.50 zip2 z31.s, z31.s, z31.s
+# CHECK-NEXT: 1 4 0.50 xar z0.b, z0.b, z1.b, #1
+# CHECK-NEXT: 1 4 0.50 xar z0.d, z0.d, z1.d, #1
+# CHECK-NEXT: 1 4 0.50 xar z0.h, z0.h, z1.h, #1
+# CHECK-NEXT: 1 4 0.50 xar z0.s, z0.s, z1.s, #1
+# CHECK-NEXT: 1 4 0.50 xar z31.b, z31.b, z30.b, #8
+# CHECK-NEXT: 1 4 0.50 xar z31.d, z31.d, z30.d, #64
+# CHECK-NEXT: 1 4 0.50 xar z31.h, z31.h, z30.h, #16
+# CHECK-NEXT: 1 4 0.50 xar z31.s, z31.s, z30.s, #32
+# CHECK-NEXT: 1 2 1.00 zip1 p0.b, p0.b, p0.b
+# CHECK-NEXT: 1 2 1.00 zip1 p0.d, p0.d, p0.d
+# CHECK-NEXT: 1 2 1.00 zip1 p0.h, p0.h, p0.h
+# CHECK-NEXT: 1 2 1.00 zip1 p0.s, p0.s, p0.s
+# CHECK-NEXT: 1 2 1.00 zip1 p15.b, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 zip1 p15.d, p15.d, p15.d
+# CHECK-NEXT: 1 2 1.00 zip1 p15.h, p15.h, p15.h
+# CHECK-NEXT: 1 2 1.00 zip1 p15.s, p15.s, p15.s
+# CHECK-NEXT: 1 3 0.50 zip1 z0.b, z0.b, z0.b
+# CHECK-NEXT: 1 3 0.50 zip1 z0.d, z0.d, z0.d
+# CHECK-NEXT: 1 3 0.50 zip1 z0.h, z0.h, z0.h
+# CHECK-NEXT: 1 3 0.50 zip1 z0.s, z0.s, z0.s
+# CHECK-NEXT: 1 3 0.50 zip1 z31.b, z31.b, z31.b
+# CHECK-NEXT: 1 3 0.50 zip1 z31.d, z31.d, z31.d
+# CHECK-NEXT: 1 3 0.50 zip1 z31.h, z31.h, z31.h
+# CHECK-NEXT: 1 3 0.50 zip1 z31.s, z31.s, z31.s
+# CHECK-NEXT: 1 2 1.00 zip2 p0.b, p0.b, p0.b
+# CHECK-NEXT: 1 2 1.00 zip2 p0.d, p0.d, p0.d
+# CHECK-NEXT: 1 2 1.00 zip2 p0.h, p0.h, p0.h
+# CHECK-NEXT: 1 2 1.00 zip2 p0.s, p0.s, p0.s
+# CHECK-NEXT: 1 2 1.00 zip2 p15.b, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 zip2 p15.d, p15.d, p15.d
+# CHECK-NEXT: 1 2 1.00 zip2 p15.h, p15.h, p15.h
+# CHECK-NEXT: 1 2 1.00 zip2 p15.s, p15.s, p15.s
+# CHECK-NEXT: 1 3 0.50 zip2 z0.b, z0.b, z0.b
+# CHECK-NEXT: 1 3 0.50 zip2 z0.d, z0.d, z0.d
+# CHECK-NEXT: 1 3 0.50 zip2 z0.h, z0.h, z0.h
+# CHECK-NEXT: 1 3 0.50 zip2 z0.s, z0.s, z0.s
+# CHECK-NEXT: 1 3 0.50 zip2 z31.b, z31.b, z31.b
+# CHECK-NEXT: 1 3 0.50 zip2 z31.d, z31.d, z31.d
+# CHECK-NEXT: 1 3 0.50 zip2 z31.h, z31.h, z31.h
+# CHECK-NEXT: 1 3 0.50 zip2 z31.s, z31.s, z31.s

# CHECK: Resources:
# CHECK-NEXT: [0] - CortexA510UnitALU0
@@ -6848,7 +6848,7 @@ zip2 z31.s, z31.s, z31.s

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11]
-# CHECK-NEXT: 79.00 75.00 75.00 9.00 - 240.00 3698.00 - - 1290.00 924.00 199.50 199.50 670.00
+# CHECK-NEXT: 79.00 75.00 75.00 9.00 - 209.00 3667.00 - - 1290.00 924.00 199.50 199.50 670.00

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11] Instructions:
@@ -7844,8 +7844,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z0.d }, p0/z, [x0]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1b { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z0.h }, p0/z, [x0]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z0.s }, p0/z, [x0]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1b { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z21.b }, p5/z, [x10, #5, mul vl]
@@ -7882,8 +7882,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z0.d }, p0/z, [x0]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1h { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z0.h }, p0/z, [x0]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z0.s }, p0/z, [x0]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1h { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z21.d }, p5/z, [x10, #5, mul vl]
@@ -7899,8 +7899,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1h { z31.d }, p7/z, [z31.d, #62]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z31.h }, p7/z, [sp, #-1, mul vl]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, #-1, mul vl]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1h { z31.s }, p7/z, [z31.s, #62]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z5.h }, p3/z, [sp, x16, lsl #1]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z5.h }, p3/z, [x17, x16, lsl #1]
@@ -7961,7 +7961,7 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.h }, p0/z, [sp, x0]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.h }, p0/z, [x0, x0]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.h }, p0/z, [x0]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.s }, p0/z, [x0]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1sb { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z21.d }, p5/z, [x10, #5, mul vl]
@@ -7981,8 +7981,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z0.d }, p0/z, [x0]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z0.d }, p0/z, [z0.d]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1sh { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z21.d }, p5/z, [x10, #5, mul vl]
@@ -7997,8 +7997,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z31.d }, p7/z, [sp, z31.d]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z31.d }, p7/z, [z31.d, #62]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, #-1, mul vl]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1sh { z31.s }, p7/z, [z31.s, #62]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
@@ -8017,8 +8017,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1w { z0.d }, p0/z, [x0]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z0.d }, p0/z, [z0.d]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1w { z0.s }, p0/z, [x0]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1w { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1w { z21.d }, p5/z, [x10, #5, mul vl]
@@ -8033,8 +8033,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z31.d }, p7/z, [sp, z31.d]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z31.d }, p7/z, [z31.d, #124]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, #-1, mul vl]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1w { z31.s }, p7/z, [z31.s, #124]
# CHECK-NEXT: - - - - - - 2.00 - - - - - - - ld2b { z0.b, z1.b }, p0/z, [x0, x0]
# CHECK-NEXT: - - - - - - 1.00 - - - - - - - ld2b { z0.b, z1.b }, p0/z, [x0]
@@ -8100,8 +8100,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1b { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1b { z0.h }, p0/z, [x0, x0]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, x0]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1b { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1b { z21.d }, p5/z, [x10, z21.d, sxtw]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1b { z21.d }, p5/z, [x10, z21.d, uxtw]
@@ -8128,8 +8128,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1h { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z0.h }, p0/z, [x0, x0, lsl #1]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, x0, lsl #1]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1h { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1h { z21.d }, p5/z, [x10, z21.d, sxtw]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1h { z21.d }, p5/z, [x10, z21.d, uxtw]
@@ -8138,16 +8138,16 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z31.d }, p7/z, [sp]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1h { z31.d }, p7/z, [z31.d, #62]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z31.h }, p7/z, [sp]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1h { z31.s }, p7/z, [z31.s, #62]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sb { z0.d }, p0/z, [x0, x0]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sb { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sb { z0.h }, p0/z, [x0, x0]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, x0]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1sb { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sb { z21.d }, p5/z, [x10, z21.d, sxtw]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sb { z21.d }, p5/z, [x10, z21.d, uxtw]
@@ -8162,8 +8162,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, x0, lsl #1]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1sh { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z21.d }, p5/z, [x10, z21.d, sxtw]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z21.d }, p5/z, [x10, z21.d, uxtw]
@@ -8171,8 +8171,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z31.d }, p7/z, [sp, z31.d]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sh { z31.d }, p7/z, [sp]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z31.d }, p7/z, [z31.d, #62]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1sh { z31.s }, p7/z, [z31.s, #62]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sw { z0.d }, p0/z, [x0, x0, lsl #2]
@@ -8190,8 +8190,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, x0, lsl #2]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1w { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z21.d }, p5/z, [x10, z21.d, sxtw]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z21.d }, p5/z, [x10, z21.d, uxtw]
@@ -8199,8 +8199,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z31.d }, p7/z, [sp, z31.d]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1w { z31.d }, p7/z, [sp]
# CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z31.d }, p7/z, [z31.d, #124]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2]
-# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2]
+# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp]
# CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1w { z31.s }, p7/z, [z31.s, #124]
# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldnf1b { z0.b }, p0/z, [x0]

From 4572a2db2c2c48820e79fb65f3810348371db4f1 Mon Sep 17 00:00:00 2001
From: David Green
Date: Wed, 17 Apr 2024 19:44:17 +0100
Subject: [PATCH 293/300] [AArch64] Add some test cases for LD2/LD3/LD4 shuffles. NFC

---
 .../CostModel/AArch64/shuffle-load.ll | 620 ++++++++++++++++++
 1 file changed, 620 insertions(+)

diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
index ec95a313e11e20..106f2f9edc2e43 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll
@@ -401,3 +401,623 @@ entry:
   %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %lane
 }
+
+define void @vld2(ptr %p) {
+; CHECK-LABEL: 'vld2'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = load <4 x i8>, ptr %p, align 4
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = load <8 x i8>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = load <16 x i8>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = load <32 x i8>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = load <4 x i16>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = load <8 x i16>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = load <16 x i16>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16 = load <32 x i16>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = load <4 x i32>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = load <8 x i32>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_1 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = load <16 x i32>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32 = load <32 x i32>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64 = load <4 x i64>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64 = load <8 x i64>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i64 = load <16 x i64>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i64 = load <32 x i64>, ptr %p, align 256
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CODESIZE-LABEL: 'vld2'
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = load <4 x i8>, ptr %p, align 4
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = load <8 x i8>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = load <16 x i8>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = load <32 x i8>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = load <4 x i16>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = load <8 x i16>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = load <16 x i16>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16 = load <32 x i16>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = load <4 x i32>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = load <8 x i32>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_1 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = load <16 x i32>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32 = load <32 x i32>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64 = load <4 x i64>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64 = load <8 x i64>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i64 = load <16 x i64>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i64 = load <32 x i64>, ptr %p, align 256
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %v4i8 = load <4 x i8>, ptr %p
+  %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 0, i32 2>
+  %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> <i32 1, i32 3>
+  %v8i8 = load <8 x i8>, ptr %p
+  %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %v16i8 = load <16 x i8>, ptr %p
+  %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %v32i8 = load <32 x i8>, ptr %p
+  %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>

  %v4i16 = load <4 x i16>, ptr %p
  %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
  %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
  %v8i16 = load <8 x i16>, ptr %p
  %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %v16i16 = load <16 x i16>, ptr %p
  %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %v32i16 = load <32 x i16>, ptr %p
  %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>

  %v4i32 = load <4 x i32>, ptr %p
  %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %v8i32 = load <8 x i32>, ptr %p
  %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %v8i32_1 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %v16i32 = load <16 x i32>, ptr %p
  %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %v32i32 = load <32 x i32>, ptr %p
  %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>

  %v2i64 = load <4 x i64>, ptr %p
  %v2i64_0 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
  %v2i64_1 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
  %v4i64 = load <8 x i64>, ptr %p
  %v4i64_0 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %v4i64_1 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %v8i64 = load <16 x i64>, ptr %p
  %v8i64_0 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %v8i64_1 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %v16i64 = load <32 x i64>, ptr %p
  %v16i64_0 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %v16i64_1 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>

  ret void
+}
+
+
+define void @vld3(ptr %p) {
+; CHECK-LABEL: 'vld3'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = load <6 x i8>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_0 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_1 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_2 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = load <12 x i8>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_0 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <24 x i8>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <48 x i8>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <6 x i16>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <12 x i16>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <24 x i16>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <48 x i16>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <6 x i32>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <12 x i32>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <24 x i32>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <48 x i32>, ptr %p, align 256
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <6 x i64>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_2 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = load <12 x i64>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_2 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64 = load <24 x i64>, ptr %p, align 256
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_2 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i64 = load <48 x i64>, ptr %p, align 512
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_2 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CODESIZE-LABEL: 'vld3'
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = load <6 x i8>, ptr %p, align 8
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_0 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_1 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_2 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = load <12 x i8>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_0 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <24 x i8>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <48 x i8>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <6 x i16>, ptr %p, align 16
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <12 x i16>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <24 x i16>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <48 x i16>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <6 x i32>, ptr %p, align 32
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <12 x i32>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <24 x i32>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <48 x i32>, ptr %p, align 256
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <6 x i64>, ptr %p, align 64
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_2 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = load <12 x i64>, ptr %p, align 128
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_2 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64 = load <24 x i64>, ptr %p, align 256
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_2 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i64 = load <48 x i64>, ptr %p, align 512
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_2 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %v2i8 = load <6 x i8>, ptr %p
+  %v2i8_0 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 0, i32 3>
+  %v2i8_1 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 1, i32 4>
+  %v2i8_2 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> <i32 2, i32 5>
+  %v4i8 = load <12 x i8>, ptr %p
+  %v4i8_0 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  %v8i8 = load <24 x i8>, ptr %p
+  %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %v16i8 = load <48 x i8>, ptr %p
+  %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>

  %v2i16 = load <6 x i16>, ptr %p
  %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 0, i32 3>
  %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 1, i32 4>
  %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> <i32 2, i32 5>
  %v4i16 = load <12 x i16>, ptr %p
  %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %v8i16 = load <24 x i16>, ptr %p
  %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %v16i16 = load <48 x i16>, ptr %p
  %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>

  %v2i32 = load <6 x i32>, ptr %p
  %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 0, i32 3>
  %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 1, i32 4>
  %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> <i32 2, i32 5>
  %v4i32 = load <12 x i32>, ptr %p
  %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %v8i32 = load <24 x i32>, ptr %p
  %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %v16i32 = load <48 x i32>, ptr %p
  %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>

  %v2i64 = load <6 x i64>, ptr %p
  %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 0, i32 3>
  %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 1, i32 4>
  %v2i64_2 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> <i32 2, i32 5>
  %v4i64 = load <12 x i64>, ptr %p
  %v4i64_0 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %v4i64_1 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %v4i64_2 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
  %v8i64 = load <24 x i64>, ptr %p
  %v8i64_0 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %v8i64_1 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %v8i64_2 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  %v16i64 = load <48 x i64>, ptr %p
  %v16i64_0 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %v16i64_1 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
  %v16i64_2 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>

  ret void
+}
+
+define void @vld4(ptr %p) {
+; CHECK-LABEL: 'vld4'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = load <8 x i8>, ptr %p, align 8
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = load <16 x i8>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 16
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = load <16 x i64>, ptr %p, align 128
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_2 = shufflevector <16
x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64 = load <32 x i64>, ptr %p, align 256 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i64 = load <64 x i64>, ptr %p, align 512 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CODESIZE-LABEL: 'vld4' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = load <8 x i8>, ptr %p, align 8 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = load <16 x i8>, ptr %p, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for 
instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for 
instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = load <16 x i64>, ptr %p, align 128 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64 = load <32 x i64>, ptr %p, align 256 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i64 = load <64 x i64>, ptr %p, align 512 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %v2i8 = load <8 x i8>, ptr %p + %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> + %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> + %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> + %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> + %v4i8 = load <16 x i8>, ptr %p + %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> + %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> + %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> + %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> + %v8i8 = load <32 x i8>, ptr %p + %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> + %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> + %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> + %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> + %v16i8 = load <64 x i8>, ptr %p + %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> + %v16i8_1 = shufflevector <64 x 
i8> %v16i8, <64 x i8> undef, <16 x i32> + %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> + %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> + + %v2i16 = load <8 x i16>, ptr %p + %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> + %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> + %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> + %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> + %v4i16 = load <16 x i16>, ptr %p + %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> + %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> + %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> + %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> + %v8i16 = load <32 x i16>, ptr %p + %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> + %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> + %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> + %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> + %v16i16 = load <64 x i16>, ptr %p + %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> + %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> + %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> + %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> + + %v2i32 = load <8 x i32>, ptr %p + %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> + %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> + %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> + %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> + %v4i32 = load <16 x i32>, ptr %p + %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> + %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> + %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> + %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> + %v8i32 = load <32 x i32>, ptr %p + %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> + %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> + %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> + %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> + %v16i32 = load <64 x i32>, ptr %p + %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> + %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> + %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> + %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> + + %v2i64 = load <8 x i64>, ptr %p + %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> + %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> + %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> + %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> + %v4i64 = load <16 x i64>, ptr %p + %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> + %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> + %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> + %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> + %v8i64 = load <32 x i64>, ptr %p + %v8i64_0 = 
shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> + %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> + %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> + %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> + %v16i64 = load <64 x i64>, ptr %p + %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> + %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> + %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> + %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> + + ret void +} + From 5a0942cd7423069e78fdfb9743a13aedfa7bdee0 Mon Sep 17 00:00:00 2001 From: Youngsuk Kim Date: Wed, 17 Apr 2024 14:45:33 -0400 Subject: [PATCH 294/300] [llvm][NVPTX] Don't emit unused var 'temp_param_reg' (NFC) (#89004) Don't emit unused variable 'temp_param_reg' which has been around since ae556d3ef72dfe5f40a337b7071f42b7bf5b66a4 . --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 3 +-- llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll | 2 -- llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 3 --- llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll | 1 - .../update_llc_test_checks/Inputs/nvptx-basic.ll.expected | 1 - 5 files changed, 1 insertion(+), 9 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 4daf47cfd4df8a..cd8546005c0289 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3785,8 +3785,7 @@ class Pseudo pattern> def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), - "\\{ // callseq $amt1, $amt2\n" - "\t.reg .b32 temp_param_reg;", + "\\{ // callseq $amt1, $amt2", [(callseq_start timm:$amt1, timm:$amt2)]>; def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index 09297fb819ce55..ce81957f2a3934 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -17,7 +17,6 @@ ; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16; ; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]]; ; CHECK-32-NEXT: { // callseq 0, 0 -; CHECK-32-NEXT: .reg .b32 temp_param_reg; ; CHECK-32-NEXT: .param .b32 param0; ; CHECK-32-NEXT: st.param.b32 [param0+0], %r[[ALLOCA]]; @@ -27,7 +26,6 @@ ; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16; ; CHECK-64-NEXT: cvta.local.u64 %rd[[ALLOCA]], %rd[[ALLOCA]]; ; CHECK-64-NEXT: { // callseq 0, 0 -; CHECK-64-NEXT: .reg .b32 temp_param_reg; ; CHECK-64-NEXT: .param .b64 param0; ; CHECK-64-NEXT: st.param.b64 [param0+0], %rd[[ALLOCA]]; diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 6895699a1dfea1..96a4359d0ec43e 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -827,7 +827,6 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: ld.param.u32 %r2, [test_call_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_call_param_0]; ; CHECK-NEXT: { // callseq 0, 0 -; CHECK-NEXT: .reg .b32 temp_param_reg; ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.b32 [param0+0], %r1; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; @@ -856,7 +855,6 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: ld.param.u32 %r2, [test_call_flipped_param_1]; ; 
CHECK-NEXT: ld.param.u32 %r1, [test_call_flipped_param_0]; ; CHECK-NEXT: { // callseq 1, 0 -; CHECK-NEXT: .reg .b32 temp_param_reg; ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.b32 [param0+0], %r2; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; @@ -885,7 +883,6 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: ld.param.u32 %r2, [test_tailcall_flipped_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_tailcall_flipped_param_0]; ; CHECK-NEXT: { // callseq 2, 0 -; CHECK-NEXT: .reg .b32 temp_param_reg; ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.b32 [param0+0], %r2; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; diff --git a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll index 8d7608ead38edb..de367dfa4acb4c 100644 --- a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll +++ b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll @@ -9,7 +9,6 @@ ; CHECK: add.u64 %rd1, %SP, 0; ; CHECK: .loc 1 5 3 // t.c:5:3 ; CHECK: { // callseq 0, 0 -; CHECK: .reg .b32 temp_param_reg; ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0+0], %rd1; ; CHECK: call.uni diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected index a65f140b460159..3ac63d070933dd 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected @@ -30,7 +30,6 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ld.u64 %rd7, [%SP+24]; ; CHECK-NEXT: ld.u64 %rd8, [%SP+16]; ; CHECK-NEXT: { // callseq 0, 0 -; CHECK-NEXT: .reg .b32 temp_param_reg; ; CHECK-NEXT: .param .align 16 .b8 param0[32]; ; CHECK-NEXT: st.param.v2.b64 [param0+0], {%rd6, %rd5}; ; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd8, %rd7}; From 800f1050e190430f217e1fd0db9414dacc835e11 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 17 Apr 2024 14:53:01 -0400 Subject: [PATCH 295/300] [GitHub] Add a new mapping for `offload` subproject (#89118) Fix #89071. --- .github/new-prs-labeler.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index 1502d64a7d3e3e..9cf64417d3cb2c 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -944,3 +944,6 @@ openmp:libomptarget: bazel: - utils/bazel/** + +offload: + - offload/** From 6f7976c883da592f2cf6bfadef152e4203c00445 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Wed, 17 Apr 2024 20:55:51 +0200 Subject: [PATCH 296/300] [libc++][TZDB] Adds sys_info formatter. 
(#85896) Implements parts of: - P0355 Extending <chrono> to Calendars and Time Zones - P1361 Integration of chrono with text formatting --- libcxx/docs/Status/FormatPaper.csv | 2 +- libcxx/include/__chrono/convert_to_tm.h | 5 + libcxx/include/__chrono/formatter.h | 45 +++++- libcxx/include/__chrono/ostream.h | 20 +++ libcxx/include/__chrono/sys_info.h | 2 +- libcxx/include/chrono | 5 + .../time.zone.info.sys/ostream.pass.cpp | 74 ++++++++++ .../time/time.syn/formatter.sys_info.pass.cpp | 137 ++++++++++++++++++ .../time.zone.info.sys/ostream.pass.cpp | 52 +++++++ .../concept.formattable.compile.pass.cpp | 4 +- libcxx/test/support/test_macros.h | 8 + 11 files changed, 347 insertions(+), 7 deletions(-) create mode 100644 libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp create mode 100644 libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp create mode 100644 libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp diff --git a/libcxx/docs/Status/FormatPaper.csv b/libcxx/docs/Status/FormatPaper.csv index e9d407e79e2539..8ace18815f5375 100644 --- a/libcxx/docs/Status/FormatPaper.csv +++ b/libcxx/docs/Status/FormatPaper.csv @@ -24,7 +24,7 @@ Section,Description,Dependencies,Assignee,Status,First released version `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::year_month_weekday``",,Mark de Wever,|Complete|,16.0 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::year_month_weekday_last``",,Mark de Wever,|Complete|,16.0 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::hh_mm_ss<duration<Rep, Period>>``",,Mark de Wever,|Complete|,17.0 -`[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::sys_info``",A ``<chrono>`` implementation,Mark de Wever,, +`[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::sys_info``",,Mark de Wever,|Complete|,19.0 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::local_info``",A ``<chrono>`` implementation,Mark de Wever,, `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::zoned_time``",A ``<chrono>`` implementation,Mark de Wever,, diff --git a/libcxx/include/__chrono/convert_to_tm.h b/libcxx/include/__chrono/convert_to_tm.h index 1301cd6f1f1ada..d2c5cf922ba671 100644 --- a/libcxx/include/__chrono/convert_to_tm.h +++ b/libcxx/include/__chrono/convert_to_tm.h @@ -20,6 +20,7 @@ #include <__chrono/month_weekday.h> #include <__chrono/monthday.h> #include <__chrono/statically_widen.h> +#include <__chrono/sys_info.h> #include <__chrono/system_clock.h> #include <__chrono/time_point.h> #include <__chrono/weekday.h> @@ -171,6 +172,10 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const _ChronoT& __value) { if (__value.hours().count() > std::numeric_limits<decltype(__result.tm_hour)>::max()) std::__throw_format_error("Formatting hh_mm_ss, encountered an hour overflow"); __result.tm_hour = __value.hours().count(); +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + } else if constexpr (same_as<_ChronoT, chrono::sys_info>) { + // Has no time information.
+# endif } else static_assert(sizeof(_ChronoT) == 0, "Add the missing type specialization"); diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h index f76e7b2ea0e864..0196deb8c1ffb3 100644 --- a/libcxx/include/__chrono/formatter.h +++ b/libcxx/include/__chrono/formatter.h @@ -24,6 +24,7 @@ #include <__chrono/ostream.h> #include <__chrono/parser_std_format_spec.h> #include <__chrono/statically_widen.h> +#include <__chrono/sys_info.h> #include <__chrono/system_clock.h> #include <__chrono/time_point.h> #include <__chrono/weekday.h> @@ -186,10 +187,11 @@ __format_zone_offset(basic_stringstream<_CharT>& __sstr, chrono::seconds __offse chrono::hh_mm_ss __hms{__offset}; std::ostreambuf_iterator<_CharT> __out_it{__sstr}; + // Note HMS does not allow formatting hours > 23, but the offset is not limited to 24H. + std::format_to(__out_it, _LIBCPP_STATICALLY_WIDEN(_CharT, "{:02}"), __hms.hours().count()); if (__modifier) - std::format_to(__out_it, _LIBCPP_STATICALLY_WIDEN(_CharT, "{:%H:%M}"), __hms); - else - std::format_to(__out_it, _LIBCPP_STATICALLY_WIDEN(_CharT, "{:%H%M}"), __hms); + __sstr << _CharT(':'); + std::format_to(__out_it, _LIBCPP_STATICALLY_WIDEN(_CharT, "{:02}"), __hms.minutes().count()); } // Helper to store the time zone information needed for formatting. @@ -202,7 +204,12 @@ struct _LIBCPP_HIDE_FROM_ABI __time_zone { template _LIBCPP_HIDE_FROM_ABI __time_zone __convert_to_time_zone([[maybe_unused]] const _Tp& __value) { - return {"UTC", chrono::seconds{0}}; +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + if constexpr (same_as<_Tp, chrono::sys_info>) + return {__value.abbrev, __value.offset}; + else +# endif + return {"UTC", chrono::seconds{0}}; } template @@ -409,6 +416,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __weekday_ok(const _Tp& __value) { return __value.weekday().ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + else if constexpr (same_as<_Tp, chrono::sys_info>) + return true; +# endif else static_assert(sizeof(_Tp) == 0, "Add the missing type specialization"); } @@ -449,6 +460,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __weekday_name_ok(const _Tp& __value) { return __value.weekday().ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + else if constexpr (same_as<_Tp, chrono::sys_info>) + return true; +# endif else static_assert(sizeof(_Tp) == 0, "Add the missing type specialization"); } @@ -489,6 +504,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __date_ok(const _Tp& __value) { return __value.ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + else if constexpr (same_as<_Tp, chrono::sys_info>) + return true; +# endif else static_assert(sizeof(_Tp) == 0, "Add the missing type specialization"); } @@ -529,6 +548,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __month_name_ok(const _Tp& __value) { return __value.month().ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + else if constexpr (same_as<_Tp, chrono::sys_info>) + return true; +# endif else static_assert(sizeof(_Tp) == 0, "Add the missing type specialization"); } @@ -858,6 +881,20 @@ struct formatter, _CharT> : public __formatter_chron return _Base::__parse(__ctx, __format_spec::__fields_chrono, __format_spec::__flags::__time); } }; + +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +template <__fmt_char_type _CharT> +struct formatter : public __formatter_chrono<_CharT> { 
+public: + using _Base = __formatter_chrono<_CharT>; + + template <class _ParseContext> + _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) { + return _Base::__parse(__ctx, __format_spec::__fields_chrono, __format_spec::__flags::__time_zone); + } +}; +# endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + #endif // if _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__chrono/ostream.h b/libcxx/include/__chrono/ostream.h index b687ef8059d5f5..9476406d761075 100644 --- a/libcxx/include/__chrono/ostream.h +++ b/libcxx/include/__chrono/ostream.h @@ -19,6 +19,7 @@ #include <__chrono/month_weekday.h> #include <__chrono/monthday.h> #include <__chrono/statically_widen.h> +#include <__chrono/sys_info.h> #include <__chrono/system_clock.h> #include <__chrono/weekday.h> #include <__chrono/year.h> @@ -262,6 +263,25 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const hh_mm_ss<_Duration> __hms return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%T}"), __hms); } +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + +template <class _CharT, class _Traits> +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& +operator<<(basic_ostream<_CharT, _Traits>& __os, const sys_info& __info) { + // __info.abbrev is always std::basic_string. + // Since these strings typically are short the conversion should be cheap. + std::basic_string<_CharT> __abbrev{__info.abbrev.begin(), __info.abbrev.end()}; + return __os << std::format( + _LIBCPP_STATICALLY_WIDEN(_CharT, "[{:%F %T}, {:%F %T}) {:%T} {:%Q%q} \"{}\""), + __info.begin, + __info.end, + hh_mm_ss{__info.offset}, + __info.save, + __abbrev); +} + +# endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + } // namespace chrono #endif // if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__chrono/sys_info.h b/libcxx/include/__chrono/sys_info.h index 794d22f2ccc1ef..461d5322d413b3 100644 --- a/libcxx/include/__chrono/sys_info.h +++ b/libcxx/include/__chrono/sys_info.h @@ -42,7 +42,7 @@ struct sys_info { } // namespace chrono -# endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/chrono b/libcxx/include/chrono index 513ae52006e890..f0275ec1eba7ae 100644 --- a/libcxx/include/chrono +++ b/libcxx/include/chrono @@ -733,6 +733,10 @@ struct sys_info { string abbrev; }; +template<class charT, class traits> // C++20 + basic_ostream<charT, traits>& + operator<<(basic_ostream<charT, traits>& os, const sys_info& si); + // 25.10.5, class time_zone // C++20 enum class choose {earliest, latest}; class time_zone { @@ -829,6 +833,7 @@ namespace std { template<class charT> struct formatter<chrono::year_month_weekday_last, charT>; // C++20 template<class Rep, class Period, class charT> struct formatter<chrono::hh_mm_ss<duration<Rep, Period>>, charT>; // C++20 + template<class charT> struct formatter<chrono::sys_info, charT>; // C++20 } // namespace std namespace chrono { diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp new file mode 100644 index 00000000000000..faa7d855c8de7d --- /dev/null +++ b/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-localization + +// TODO FMT This test should not require std::to_chars(floating-point) +// XFAIL: availability-fp_to_chars-missing + +// XFAIL: libcpp-has-no-incomplete-tzdb + +// <chrono> + +// template<class charT, class traits> +// basic_ostream<charT, traits>& +// operator<<(basic_ostream<charT, traits>& os, const sys_info& r); + +// [time.zone.info.sys] +// 7 Effects: Streams out the sys_info object r in an unspecified format. +// 8 Returns: os. +// +// Tests the output produced by this function. + +#include <chrono> +#include <sstream> +#include <string> +#include <string_view> + +#include "assert_macros.h" +#include "test_macros.h" +#include "make_string.h" +#include "concat_macros.h" + +#define SV(S) MAKE_STRING_VIEW(CharT, S) + +template <class CharT> +static void test(std::basic_string_view<CharT> expected, std::chrono::sys_info&& info) { + std::basic_stringstream<CharT> sstr; + sstr << info; + std::basic_string<CharT> output = sstr.str(); + + TEST_REQUIRE(expected == output, + TEST_WRITE_CONCATENATED("\nExpected output ", expected, "\nActual output ", output, '\n')); +} + +template <class CharT> +static void test() { + using namespace std::literals::chrono_literals; + namespace tz = std::chrono; + + test(SV("[-10484-10-16 15:30:08, 14423-03-17 15:30:07) 00:00:00 0min \"TZ\""), + tz::sys_info{tz::sys_seconds::min(), tz::sys_seconds::max(), 0s, 0min, "TZ"}); + + test(SV("[1970-01-01 00:00:00, 2038-12-31 00:00:00) 12:23:45 -67min \"DMY\""), + tz::sys_info{static_cast<tz::sys_days>(tz::year_month_day{1970y, tz::January, 1d}), + static_cast<tz::sys_days>(tz::year_month_day{2038y, tz::December, 31d}), + 12h + 23min + 45s, + -67min, + "DMY"}); +} + +int main(int, const char**) { + test<char>(); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test<wchar_t>(); +#endif + + return 0; +} diff --git a/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp b/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp new file mode 100644 index 00000000000000..0a41424cfdcb83 --- /dev/null +++ b/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp @@ -0,0 +1,137 @@ +//===----------------------------------------------------------------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-localization + +// TODO FMT This test should not require std::to_chars(floating-point) +// XFAIL: availability-fp_to_chars-missing + +// XFAIL: libcpp-has-no-incomplete-tzdb + +// REQUIRES: locale.fr_FR.UTF-8 +// REQUIRES: locale.ja_JP.UTF-8 + +// <chrono> +// +// template<class charT> struct formatter<chrono::sys_info, charT>; + +#include <chrono> +#include <format> + +#include <cassert> +#include <concepts> +#include <locale> +#include <iostream> +#include <type_traits> + +#include "formatter_tests.h" +#include "make_string.h" +#include "platform_support.h" // locale name macros +#include "test_macros.h" + +template <class CharT> +static void test_no_chrono_specs() { +// This test is libc++ specific due to +// [time.zone.info.sys]/7 +// Effects: Streams out the sys_info object r in an unspecified format.
+#ifdef _LIBCPP_VERSION + using namespace std::literals::chrono_literals; + namespace tz = std::chrono; + + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output + + check(SV("[-10484-10-16 15:30:08, 14423-03-17 15:30:07) 00:00:00 0min \"TZ\""), + SV("{}"), + tz::sys_info{tz::sys_seconds::min(), tz::sys_seconds::max(), 0s, 0min, "TZ"}); + + check(SV("[1970-01-01 00:00:00, 2038-12-31 00:00:00) 12:23:45 -67min \"DMY\""), + SV("{}"), + tz::sys_info{static_cast<tz::sys_days>(tz::year_month_day{1970y, tz::January, 1d}), + static_cast<tz::sys_days>(tz::year_month_day{2038y, tz::December, 31d}), + 12h + 23min + 45s, + -67min, + "DMY"}); + + std::locale::global(std::locale::classic()); +#endif // _LIBCPP_VERSION +} + +template <class CharT> +static void test_valid_values() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view<CharT> fmt = SV("{:%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}"); + constexpr std::basic_string_view<CharT> lfmt = SV("{:L%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%z='-0200'\t%Ez='-02:00'\t%Oz='-02:00'\t%Z='NEG'\n"), + fmt, + std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, -2h, 0min, "NEG"}); + + check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='ZERO'\n"), + fmt, + std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 0s, 0min, "ZERO"}); + + check(SV("%z='+1115'\t%Ez='+11:15'\t%Oz='+11:15'\t%Z='POS'\n"), + fmt, + std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 11h + 15min, 0min, "POS"}); + + // Use the global locale (fr_FR) + check(SV("%z='-0200'\t%Ez='-02:00'\t%Oz='-02:00'\t%Z='NEG'\n"), + lfmt, + std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, -2h, 0min, "NEG"}); + + check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='ZERO'\n"), + lfmt, + std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 0s, 0min, "ZERO"}); + + check(SV("%z='+1115'\t%Ez='+11:15'\t%Oz='+11:15'\t%Z='POS'\n"), + lfmt, + std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 11h + 15min, 0min, "POS"}); + + // Use supplied locale (ja_JP).
+ check(loc, + SV("%z='-0200'\t%Ez='-02:00'\t%Oz='-02:00'\t%Z='NEG'\n"), + lfmt, + std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, -2h, 0min, "NEG"}); + + check(loc, + SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='ZERO'\n"), + lfmt, + std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 0s, 0min, "ZERO"}); + + check(loc, + SV("%z='+1115'\t%Ez='+11:15'\t%Oz='+11:15'\t%Z='POS'\n"), + lfmt, + std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 11h + 15min, 0min, "POS"}); + + std::locale::global(std::locale::classic()); +} + +template <class CharT> +static void test() { + test_no_chrono_specs<CharT>(); + test_valid_values<CharT>(); + + check_invalid_types<CharT>({SV("z"), SV("Z"), SV("Ez"), SV("Oz")}, std::chrono::sys_info{}); +} + +int main(int, char**) { + test<char>(); + +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test<wchar_t>(); +#endif + + return 0; +} diff --git a/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp new file mode 100644 index 00000000000000..82c4844b423c5a --- /dev/null +++ b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-localization + +// TODO FMT This test should not require std::to_chars(floating-point) +// XFAIL: availability-fp_to_chars-missing + +// XFAIL: libcpp-has-no-incomplete-tzdb + +// <chrono> + +// template<class charT, class traits> +// basic_ostream<charT, traits>& +// operator<<(basic_ostream<charT, traits>& os, const sys_info& r); + +// [time.zone.info.sys] +// 7 Effects: Streams out the sys_info object r in an unspecified format. +// 8 Returns: os. +// +// There is a private libc++ test that validates the exact output.
+ +#include <cassert> +#include <chrono> +#include <memory> +#include <sstream> + +#include "test_macros.h" + +template <class CharT> +static void test() { + using namespace std::literals::chrono_literals; + std::chrono::sys_info s{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 0h, 0min, ""}; + std::basic_ostringstream<CharT> os; + std::basic_ostream<CharT>& result = std::chrono::operator<<(os, s); + assert(std::addressof(result) == std::addressof(os)); +} + +int main(int, const char**) { + test<char>(); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test<wchar_t>(); +#endif + + return 0; +} diff --git a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp index fad5277fd6e110..f40784ec446fae 100644 --- a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp @@ -176,10 +176,12 @@ void test_P1361() { assert_is_formattable<std::chrono::hh_mm_ss<std::chrono::microseconds>, CharT>(); - //assert_is_formattable<std::chrono::sys_info, CharT>(); +# if !defined(TEST_HAS_NO_INCOMPLETE_TZDB) + assert_is_formattable<std::chrono::sys_info, CharT>(); //assert_is_formattable<std::chrono::local_info, CharT>(); //assert_is_formattable<std::chrono::zoned_time, CharT>(); +# endif // !defined(TEST_HAS_NO_INCOMPLETE_TZDB) #endif // TEST_HAS_NO_LOCALIZATION } diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 7b2dcbb52d0c88..fe9207d7e59690 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -417,6 +417,14 @@ inline Tp const& DoNotOptimize(Tp const& value) { # define TEST_HAS_NO_RANDOM_DEVICE #endif +#if defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# define TEST_HAS_NO_INCOMPLETE_TZDB +#endif + +#if defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) +# define TEST_HAS_NO_TIME_ZONE_DATABASE +#endif + #if defined(TEST_COMPILER_CLANG) # define TEST_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") # define TEST_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") From b1dc62f139ef265a36a2a739ce9ba4e1e48a6dbe Mon Sep 17 00:00:00 2001 From: Amirreza Ashouri Date: Wed, 17 Apr 2024 22:28:11 +0330 Subject: [PATCH 297/300] [clang] Treat arguments to builtin type traits as template type arguments (#87132) This change improves error messages for builtins in the case of empty parentheses. Fixes llvm#86997 --- clang/lib/Parse/ParseExprCXX.cpp | 12 ++++++------ clang/test/Sema/static-assert.c | 9 ++++----- clang/test/SemaCXX/builtins.cpp | 5 +++++ clang/test/SemaCXX/deprecated-builtins.cpp | 5 +++++ 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index 43d6105dcf31c4..0d2ad980696fcc 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -3910,10 +3910,10 @@ ExprResult Parser::ParseTypeTrait() { SmallVector<ParsedType, 2> Args; do { // Parse the next type. - TypeResult Ty = - ParseTypeName(/*SourceRange=*/nullptr, - getLangOpts().CPlusPlus ? DeclaratorContext::TemplateArg - : DeclaratorContext::TypeName); + TypeResult Ty = ParseTypeName(/*SourceRange=*/nullptr, + getLangOpts().CPlusPlus + ?
DeclaratorContext::TemplateTypeArg + : DeclaratorContext::TypeName); if (Ty.isInvalid()) { Parens.skipToEnd(); return ExprError(); @@ -3955,8 +3955,8 @@ ExprResult Parser::ParseArrayTypeTrait() { if (T.expectAndConsume()) return ExprError(); - TypeResult Ty = - ParseTypeName(/*SourceRange=*/nullptr, DeclaratorContext::TemplateArg); + TypeResult Ty = ParseTypeName(/*SourceRange=*/nullptr, + DeclaratorContext::TemplateTypeArg); if (Ty.isInvalid()) { SkipUntil(tok::comma, StopAtSemi); SkipUntil(tok::r_paren, StopAtSemi); diff --git a/clang/test/Sema/static-assert.c b/clang/test/Sema/static-assert.c index ae5e8076e0beda..4e9e6b7ee558bd 100644 --- a/clang/test/Sema/static-assert.c +++ b/clang/test/Sema/static-assert.c @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -std=c11 -Wgnu-folding-constant -fsyntax-only -verify=expected,c %s -// RUN: %clang_cc1 -fms-compatibility -Wgnu-folding-constant -DMS -fsyntax-only -verify=expected,ms,c %s -// RUN: %clang_cc1 -std=c99 -pedantic -Wgnu-folding-constant -fsyntax-only -verify=expected,ext,c %s +// RUN: %clang_cc1 -std=c11 -Wgnu-folding-constant -fsyntax-only -verify %s +// RUN: %clang_cc1 -fms-compatibility -Wgnu-folding-constant -DMS -fsyntax-only -verify=expected,ms %s +// RUN: %clang_cc1 -std=c99 -pedantic -Wgnu-folding-constant -fsyntax-only -verify=expected,ext %s // RUN: %clang_cc1 -xc++ -std=c++11 -pedantic -fsyntax-only -verify=expected,ext,cxx %s _Static_assert("foo", "string is nonzero"); // ext-warning {{'_Static_assert' is a C11 extension}} @@ -57,8 +57,7 @@ UNION(char[2], short) u2 = { .one = { 'a', 'b' } }; // ext-warning 3 {{'_Static_ typedef UNION(char, short) U3; // expected-error {{static assertion failed due to requirement 'sizeof(char) == sizeof(short)': type size mismatch}} \ // expected-note{{evaluates to '1 == 2'}} \ // ext-warning 3 {{'_Static_assert' is a C11 extension}} -typedef UNION(float, 0.5f) U4; // c-error {{expected a type}} \ - // cxx-error {{type name requires a specifier or qualifier}} \ +typedef UNION(float, 0.5f) U4; // expected-error {{expected a type}} \ // ext-warning 3 {{'_Static_assert' is a C11 extension}} // After defining the assert macro in MS-compatibility mode, we should diff --git a/clang/test/SemaCXX/builtins.cpp b/clang/test/SemaCXX/builtins.cpp index 567094c94c171b..080b4476c7eec1 100644 --- a/clang/test/SemaCXX/builtins.cpp +++ b/clang/test/SemaCXX/builtins.cpp @@ -76,6 +76,11 @@ using ConstMemFnType = int (Dummy::*)() const; void foo() {} +void test_builtin_empty_parentheses_diags() { + __is_trivially_copyable(); // expected-error {{expected a type}} + __is_trivially_copyable(1); // expected-error {{expected a type}} +} + void test_builtin_launder_diags(void *vp, const void *cvp, FnType *fnp, MemFnType mfp, ConstMemFnType cmfp, int (&Arr)[5]) { __builtin_launder(vp); // expected-error {{void pointer argument to '__builtin_launder' is not allowed}} diff --git a/clang/test/SemaCXX/deprecated-builtins.cpp b/clang/test/SemaCXX/deprecated-builtins.cpp index 849b9b014fff25..fafc1da4da13eb 100644 --- a/clang/test/SemaCXX/deprecated-builtins.cpp +++ b/clang/test/SemaCXX/deprecated-builtins.cpp @@ -17,3 +17,8 @@ void f() { a = __has_trivial_destructor(A); // expected-warning-re {{__has_trivial_destructor {{.*}} use __is_trivially_destructible}} } + +void test_builtin_empty_parentheses_diags(void) { + __has_nothrow_copy(); // expected-error {{expected a type}} + __has_nothrow_copy(1); // expected-error {{expected a type}} +} From 6cea7c491f4c4c68aa0494a9b18f36ff40c22c81 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks 
Date: Wed, 17 Apr 2024 12:04:18 -0700 Subject: [PATCH 298/300] [X86] Always use 64-bit relocations in no-PIC large code model (#89101) This matches other types of relocations, e.g. to constant pool. And makes things more consistent with PIC large code model. Some users of the large code model may not place small data in the lower 2GB of the address space (e.g. https://github.com/ClangBuiltLinux/linux/issues/2016), so just unconditionally use 64-bit relocations in the large code model. So now functions in a section not marked large will use 64-bit relocations to reference everything when using the large code model. This also fixes some lldb tests broken by #88172 (https://lab.llvm.org/buildbot/#/builders/68/builds/72458). --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 7 +++---- llvm/test/CodeGen/X86/code-model-elf.ll | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 4e4241efd63d6b..7dcde2a508949d 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2927,11 +2927,10 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, } bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { - // Cannot use 32 bit constants to reference objects in kernel code model. - // Cannot use 32 bit constants to reference objects in large PIC mode since - // GOTOFF is 64 bits. + // Cannot use 32 bit constants to reference objects in kernel/large code + // model. if (TM.getCodeModel() == CodeModel::Kernel || - (TM.getCodeModel() == CodeModel::Large && TM.isPositionIndependent())) + TM.getCodeModel() == CodeModel::Large) return false; // In static codegen with small code model, we can get the address of a label diff --git a/llvm/test/CodeGen/X86/code-model-elf.ll b/llvm/test/CodeGen/X86/code-model-elf.ll index 0da62e3e7a6519..f60f75bc26911e 100644 --- a/llvm/test/CodeGen/X86/code-model-elf.ll +++ b/llvm/test/CodeGen/X86/code-model-elf.ll @@ -350,7 +350,7 @@ define dso_local ptr @lea_forced_small_data() #0 { ; ; LARGE-STATIC-LABEL: lea_forced_small_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movl $forced_small_data, %eax +; LARGE-STATIC-NEXT: movabsq $forced_small_data, %rax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: lea_forced_small_data: @@ -403,7 +403,7 @@ define dso_local i32 @load_forced_small_data() #0 { ; ; LARGE-STATIC-LABEL: load_forced_small_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movl $forced_small_data+8, %eax +; LARGE-STATIC-NEXT: movabsq $forced_small_data+8, %rax ; LARGE-STATIC-NEXT: movl (%rax), %eax ; LARGE-STATIC-NEXT: retq ; From 1460b4964c7ada2f7536006722c8585b5bd0a1b5 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 17 Apr 2024 19:11:58 +0000 Subject: [PATCH 299/300] [gn build] Manually port d423d80e560d --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index ee44558a4e9947..bd08ce044c247d 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -37,9 +37,9 @@ if (current_toolchain == default_toolchain) { "_LIBCPP_INSTRUMENTED_WITH_ASAN=", "_LIBCPP_ABI_DEFINES=", "_LIBCPP_HARDENING_MODE_DEFAULT=_LIBCPP_HARDENING_MODE_NONE", - "_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH=", - "_LIBCPP_PSTL_CPU_BACKEND_SERIAL=1", - 
"_LIBCPP_PSTL_CPU_BACKEND_THREAD=", + "_LIBCPP_PSTL_BACKEND_LIBDISPATCH=", + "_LIBCPP_PSTL_BACKEND_SERIAL=1", + "_LIBCPP_PSTL_BACKEND_STD_THREAD=", ] if (libcxx_abi_version != 1) { values += [ "_LIBCPP_ABI_VERSION=$libcxx_abi_version" ] @@ -143,18 +143,12 @@ if (current_toolchain == default_toolchain) { "__algorithm/pop_heap.h", "__algorithm/prev_permutation.h", "__algorithm/pstl_any_all_none_of.h", - "__algorithm/pstl_backend.h", - "__algorithm/pstl_backends/cpu_backend.h", "__algorithm/pstl_backends/cpu_backends/any_of.h", - "__algorithm/pstl_backends/cpu_backends/backend.h", "__algorithm/pstl_backends/cpu_backends/fill.h", "__algorithm/pstl_backends/cpu_backends/find_if.h", "__algorithm/pstl_backends/cpu_backends/for_each.h", - "__algorithm/pstl_backends/cpu_backends/libdispatch.h", "__algorithm/pstl_backends/cpu_backends/merge.h", - "__algorithm/pstl_backends/cpu_backends/serial.h", "__algorithm/pstl_backends/cpu_backends/stable_sort.h", - "__algorithm/pstl_backends/cpu_backends/thread.h", "__algorithm/pstl_backends/cpu_backends/transform.h", "__algorithm/pstl_backends/cpu_backends/transform_reduce.h", "__algorithm/pstl_copy.h", @@ -664,6 +658,11 @@ if (current_toolchain == default_toolchain) { "__numeric/transform_exclusive_scan.h", "__numeric/transform_inclusive_scan.h", "__numeric/transform_reduce.h", + "__pstl/backends/libdispatch.h", + "__pstl/backends/serial.h", + "__pstl/backends/std_thread.h", + "__pstl/configuration.h", + "__pstl/configuration_fwd.h", "__pstl/cpu_algos/cpu_traits.h", "__random/bernoulli_distribution.h", "__random/binomial_distribution.h", From 9ef6d66b6a534a27b04a8744f8dfb42fcbdb127b Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 17 Apr 2024 20:14:57 +0000 Subject: [PATCH 300/300] add test, check PGOOpt before using PGOOpt->ColdOptType --- clang/test/CodeGen/pgo-force-function-attrs.ll | 12 ++++++++++++ llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGen/pgo-force-function-attrs.ll diff --git a/clang/test/CodeGen/pgo-force-function-attrs.ll b/clang/test/CodeGen/pgo-force-function-attrs.ll new file mode 100644 index 00000000000000..3e9ea95e4df410 --- /dev/null +++ b/clang/test/CodeGen/pgo-force-function-attrs.ll @@ -0,0 +1,12 @@ +; RUN: %clang_cc1 -O2 -mllvm -pgo-cold-func-opt=optsize -mllvm -enable-pgo-force-function-attrs -fprofile-sample-use=%S/Inputs/pgo-sample.prof %s -emit-llvm -o - | FileCheck %s --check-prefix=OPTSIZE +; Check that no profile means no optsize +; RUN: %clang_cc1 -O2 -mllvm -pgo-cold-func-opt=optsize -mllvm -enable-pgo-force-function-attrs %s -emit-llvm -o - | FileCheck %s --check-prefix=NONE +; Check that no -pgo-cold-func-opt=optsize means no optsize +; RUN: %clang_cc1 -O2 -mllvm -enable-pgo-force-function-attrs -fprofile-sample-use=%S/Inputs/pgo-sample.prof %s -emit-llvm -o - | FileCheck %s --check-prefix=NONE + +; NONE-NOT: optsize +; OPTSIZE: optsize + +define void @f() cold { + ret void +} diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 3bb2ce0ae3460b..90ba3b541553e2 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1169,7 +1169,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, if (EnableSyntheticCounts && !PGOOpt) MPM.addPass(SyntheticCountsPropagation()); - if (EnablePGOForceFunctionAttrs) + if (EnablePGOForceFunctionAttrs && PGOOpt) MPM.addPass(PGOForceFunctionAttrsPass(PGOOpt->ColdOptType)); 
   MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true));
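To spell out the rationale for the final hunk: PGOOpt (an
std::optional<PGOOptions> in PassBuilder) is populated only when some
profile option, such as -fprofile-use or -fprofile-sample-use, is in
effect. A minimal self-contained sketch of the failure mode the added
check prevents, using hypothetical stand-in types rather than the
verbatim pipeline code:

    #include <optional>

    // Stand-ins for the real LLVM types, for illustration only.
    struct PGOOptions { int ColdOptType = 0; };
    void addForceFunctionAttrsPass(int /*ColdOptType*/) {}

    void buildPipeline(bool EnablePGOForceFunctionAttrs,
                       const std::optional<PGOOptions> &PGOOpt) {
      // PGOOpt stays empty when no profile flags were given; with the
      // cl::opt enabled but no profile, the previously unguarded
      // PGOOpt->ColdOptType read dereferenced an empty optional
      // (undefined behavior).
      if (EnablePGOForceFunctionAttrs && PGOOpt)
        addForceFunctionAttrsPass(PGOOpt->ColdOptType);
    }

The NONE RUN lines in the new pgo-force-function-attrs.ll test exercise
exactly that configuration: the flag is set but no profile is supplied,
so no optsize attribute may appear and, with the check in place, no
empty-optional dereference occurs.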